diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ccd6b6e2b..3221a2052 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1670,7 +1670,7 @@ dsr1-fp4-b200-sglang:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
 
 dsv4-fp4-b200-sglang:
-  image: lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b
+  image: lmsysorg/sglang:deepseek-v4-blackwell
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dsv4
@@ -1690,7 +1690,7 @@ dsv4-fp4-b200-sglang:
     osl: 1024
     search-space:
       # low-latency
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
       # balanced
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
       # max-throughput
@@ -1699,7 +1699,7 @@ dsv4-fp4-b200-sglang:
     osl: 1024
     search-space:
      # low-latency
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
       # balanced
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
       # max-throughput
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index e7a676b45..d455af3a3 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -5,7 +5,6 @@ source "$(dirname "$0")/../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
-    DP_ATTENTION \
     CONC \
     ISL \
     OSL \
@@ -20,13 +19,7 @@ hf download "$MODEL"
 
 nvidia-smi
 
-# Common SGLANG env vars (apply to every config).
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
-export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
-export SGLANG_OPT_USE_JIT_NORM=1
-export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
-export SGLANG_OPT_USE_TOPK_V2=1
-export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 
 # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
 # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
@@ -37,7 +30,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
-echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -47,41 +40,47 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
-# script's pattern). DP-attention turns on EP-MoE (deepep) and the related
-# mega_moe optimizations; single-instance uses flashinfer_mxfp4.
+# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
+# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune
+# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128
+# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-if [ "${DP_ATTENTION}" = "true" ]; then
-    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-    export SGLANG_OPT_USE_FAST_MASK_EP=1
-    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
-    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    PARALLEL_ARGS=(
+if [[ $CONC -le 32 ]]; then
+    RECIPE=low-latency
+    RECIPE_FLAGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 4096
+        --disable-flashinfer-autotune
+        --mem-fraction-static 0.82
+    )
+elif [[ $CONC -le 128 ]]; then
+    RECIPE=balanced
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
         --dp-size "$TP"
         --enable-dp-attention
         --moe-a2a-backend deepep
         --deepep-config "$DEEPEP_CONFIG"
-        --chunked-prefill-size 32768
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 128
     )
 else
-    PARALLEL_ARGS=(
-        --moe-runner-backend flashinfer_mxfp4
-        --chunked-prefill-size 8192
-        --disable-flashinfer-autotune
+    RECIPE=max-throughput
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 256
     )
 fi
-
-# Print all SGLANG_* env vars to both the CI step log and server.log so the
-# launch config is auditable from the result artifact alone.
-{
-    echo "=== SGLANG_* env vars at launch ==="
-    env | grep -E '^SGLANG_' | sort
-    echo "==================================="
-} | tee "$SERVER_LOG"
+echo "Recipe: $RECIPE (CONC=$CONC)"
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
@@ -90,10 +89,8 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --max-running-requests "$((CONC * 3 / 2))" \
-    --mem-fraction-static 0.90 \
-    --swa-full-tokens-ratio 0.1 \
-    "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & 
+    --disable-radix-cache \
+    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 SERVER_PID=$!
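To sanity-check the CONC-to-recipe mapping locally, the script can be driven the same way CI drives it, one concurrency per run. A sketch, not a verified invocation: MODEL, TP, ISL, and OSL mirror the dsv4-fp4-b200-sglang YAML entry above, the CONC values are illustrative picks from each band, and any variables in the elided tail of check_env_vars would also need to be set:

    # CONC <= 32 -> low-latency recipe (TP-only, flashinfer_mxfp4, no autotune)
    MODEL=deepseek-ai/DeepSeek-V4-Pro TP=8 CONC=16 ISL=1024 OSL=1024 \
        ./benchmarks/single_node/dsv4_fp4_b200.sh

    # 32 < CONC <= 128 -> balanced recipe (DP-attention + deepep, max-running-requests=128)
    MODEL=deepseek-ai/DeepSeek-V4-Pro TP=8 CONC=64 ISL=1024 OSL=1024 \
        ./benchmarks/single_node/dsv4_fp4_b200.sh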