diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ccd6b6e2b..5978a09e2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1832,33 +1832,40 @@ dsr1-fp8-b300-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } -# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while -# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro -# on B300. Re-introduce balanced/max-throughput rows once fixed upstream. +# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 +# lists B200 (not B300) as the Blackwell target. This config adapts the +# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP; the B300 rows +# below scale this down to TP=4/EP=4) on B300 +# until a B300-specific recipe ships. Prefix caching is left at the sglang +# default (the launch script no longer passes --disable-radix-cache). +# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 precision: fp4 framework: sglang multinode: false - # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP) - # while the DeepEP FP8 weight-postprocess path is broken for this - # checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3 - # integers. raised from sglang.srt.layers.quantization.fp8 - # .process_weights_after_loading_block_quant). Full concurrency sweep - # retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300 - # once sglang can load the checkpoint under --moe-a2a-backend deepep. 
+ # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 + # are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by DP_ATTENTION (set from each search-space row's dp-attn flag): + # low-latency (conc 1 rows): TP=8, TP-only + # balanced (conc 32 rows): TP=4, TP-only + # max-throughput (conc 512 rows): TP=4 + DP-attn + DeepEP + # Split so result filenames (ep=, dpa=) accurately reflect the recipe. + # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, + # while low-latency leaves ep_size at the default of 1. NOTE(review): the script passes --deepep-config but not --moe-a2a-backend deepep; confirm DeepEP is actually enabled for the EP=4 rows. seq-len-configs: - isl: 1024 osl: 1024 search-space: -    - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 } +    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } +    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } +    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - isl: 8192 osl: 1024 search-space: -    - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 } +    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } +    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } +    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index c9fb238a5..ac552c733 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -23,12 +24,13 @@ fi nvidia-smi +# Common SGLANG env vars (apply to every config). export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 - -# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its -# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half -# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks. 
-unset CUDA_VISIBLE_DEVICES +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -40,7 +42,7 @@ unset CUDA_VISIBLE_DEVICES SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -50,21 +52,57 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the -# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300 -# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from -# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant). -# Restore the CONC-based low-latency / balanced / max-throughput dispatch -# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under -# --moe-a2a-backend deepep. -RECIPE=low-latency -RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 4096 - --disable-flashinfer-autotune - --mem-fraction-static 0.82 -) -echo "Recipe: $RECIPE (CONC=$CONC)" +# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was +# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. +if [[ "$ISL" == "1024" ]]; then + SWA_FULL_TOKENS_RATIO=0.5 +else + SWA_FULL_TOKENS_RATIO=0.1 +fi + +# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm +# script's pattern). DP-attention runs the empirically-tuned high-concurrency +# recipe (flashinfer_mxfp4 runner + doubled prefill chunks (16384 vs 8192) + prefill-delayer); +# single-instance uses flashinfer_mxfp4 with the cookbook defaults. 
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + +# Default; the DP-attn branch below overrides to 0.94. +MEM_FRACTION_STATIC=0.90 + +if [ "${DP_ATTENTION}" = "true" ]; then + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 16384 + --enable-prefill-delayer + ) + MEM_FRACTION_STATIC=0.94 +else + PARALLEL_ARGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 8192 + --disable-flashinfer-autotune + ) +fi + +# Print all SGLANG_* env vars to both the CI step log and server.log so the +# launch config is auditable from the result artifact alone. +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" set -x PYTHONNOUSERSITE=1 sglang serve \ @@ -73,8 +111,10 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --disable-radix-cache \ - "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ + --mem-fraction-static "$MEM_FRACTION_STATIC" \ + --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ + "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 89f0a7aea..accb9a583 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1869,3 +1869,12 @@ - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155 +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" + - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 +