diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 42c720a63..911cb503e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1670,7 +1670,7 @@ dsr1-fp4-b200-sglang: - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } dsv4-fp4-b200-sglang: - image: lmsysorg/sglang:deepseek-v4-blackwell + image: lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -1690,7 +1690,7 @@ dsv4-fp4-b200-sglang: osl: 1024 search-space: # low-latency - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } # balanced - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } # max-throughput @@ -1699,7 +1699,7 @@ dsv4-fp4-b200-sglang: osl: 1024 search-space: # low-latency - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } # balanced - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } # max-throughput @@ -1838,7 +1838,7 @@ dsr1-fp8-b300-sglang: # until a B300-specific recipe ships. Prefix caching is disabled. # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -1857,21 +1857,15 @@ dsv4-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - # low-latency - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - # balanced - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - # low-latency - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - # balanced - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index d455af3a3..e7a676b45 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -19,7 +20,13 @@ hf download "$MODEL" nvidia-smi +# Common SGLANG env vars (apply to every config). export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -30,7 +37,7 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -40,47 +47,41 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): -# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune -# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 -# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 +# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm +# script's pattern). DP-attention turns on EP-MoE (deepep) and the related +# mega_moe optimizations; single-instance uses flashinfer_mxfp4. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -if [[ $CONC -le 32 ]]; then - RECIPE=low-latency - RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 4096 - --disable-flashinfer-autotune - --mem-fraction-static 0.82 - ) -elif [[ $CONC -le 128 ]]; then - RECIPE=balanced - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 - RECIPE_FLAGS=( +if [ "${DP_ATTENTION}" = "true" ]; then + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention --moe-a2a-backend deepep --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - --max-running-requests 128 + --chunked-prefill-size 32768 ) else - RECIPE=max-throughput - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - --max-running-requests 256 + PARALLEL_ARGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 8192 + --disable-flashinfer-autotune ) fi -echo "Recipe: $RECIPE (CONC=$CONC)" + +# Print all SGLANG_* env vars to both the CI step log and server.log so the +# launch config is auditable from the result artifact alone. +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" set -x PYTHONNOUSERSITE=1 sglang serve \ @@ -89,8 +90,10 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --disable-radix-cache \ - "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --max-running-requests "$((CONC * 3 / 2))" \ + --mem-fraction-static 0.90 \ + --swa-full-tokens-ratio 0.1 \ + "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index faa946174..6fae10837 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -23,12 +24,13 @@ fi nvidia-smi +# Common SGLANG env vars (apply to every config). export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 - -# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its -# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half -# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks. -unset CUDA_VISIBLE_DEVICES +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -40,7 +42,7 @@ unset CUDA_VISIBLE_DEVICES SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -50,47 +52,56 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): -# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune -# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 -# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 +# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was +# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. +if [[ "$ISL" == "1024" ]]; then + SWA_FULL_TOKENS_RATIO=0.5 +else + SWA_FULL_TOKENS_RATIO=0.1 +fi + +# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm +# script's pattern). DP-attention runs the empirically-tuned high-concurrency +# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer); +# single-instance uses flashinfer_mxfp4 with the cookbook defaults. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -if [[ $CONC -le 32 ]]; then - RECIPE=low-latency - RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 4096 - --disable-flashinfer-autotune - --mem-fraction-static 0.82 - ) -elif [[ $CONC -le 128 ]]; then - RECIPE=balanced - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 - RECIPE_FLAGS=( +# Default; the DP-attn branch below overrides to 0.94. +MEM_FRACTION_STATIC=0.90 + +if [ "${DP_ATTENTION}" = "true" ]; then + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention - --moe-a2a-backend deepep + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - --max-running-requests 128 + --chunked-prefill-size 16384 + --enable-prefill-delayer ) + MEM_FRACTION_STATIC=0.94 else - RECIPE=max-throughput - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - --max-running-requests 256 + PARALLEL_ARGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 8192 + --disable-flashinfer-autotune ) fi -echo "Recipe: $RECIPE (CONC=$CONC)" + +# Print all SGLANG_* env vars to both the CI step log and server.log so the +# launch config is auditable from the result artifact alone. +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" set -x PYTHONNOUSERSITE=1 sglang serve \ @@ -99,8 +110,10 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --disable-radix-cache \ - "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --max-running-requests "$((CONC * 3 / 2))" \ + --mem-fraction-static "$MEM_FRACTION_STATIC" \ + --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ + "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7ed3c16ff..cab0d406e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1784,7 +1784,7 @@ description: - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 - config-keys: - dsv4-fp8-mi355x-sglang