diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 616aa6ff9..ccd6b6e2b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1832,40 +1832,33 @@ dsr1-fp8-b300-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } -# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# lists B200 (not B300) as the Blackwell target. This config reuses the -# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300 -# until a B300-specific recipe ships. Prefix caching is disabled. -# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. +# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while +# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro +# on B300. Re-introduce balanced/max-throughput rows once fixed upstream. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 + image: lmsysorg/sglang:deepseek-v4-b300 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 precision: fp4 framework: sglang multinode: false - # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 - # are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC: - # low-latency (CONC <= 32): TP-only - # balanced (32 < CONC <= 128): + DP-attn - # max-throughput (CONC > 128): + DP-attn - # Split so result filenames (ep=, dpa=) accurately reflect the recipe. - # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, - # while low-latency leaves ep_size at the default of 1. + # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP) + # while the DeepEP FP8 weight-postprocess path is broken for this + # checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3 + # integers. raised from sglang.srt.layers.quantization.fp8 + # .process_weights_after_loading_block_quant). Full concurrency sweep + # retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300 + # once sglang can load the checkpoint under --moe-a2a-backend deepep. seq-len-configs: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index ac552c733..c9fb238a5 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -5,7 +5,6 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ - DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -24,13 +23,12 @@ fi nvidia-smi -# Common SGLANG env vars (apply to every config). export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 -export SGLANG_OPT_USE_JIT_NORM=1 -export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 -export SGLANG_OPT_USE_TOPK_V2=1 -export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + +# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its +# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half +# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks. +unset CUDA_VISIBLE_DEVICES # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -42,7 +40,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -52,57 +50,21 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was -# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. -if [[ "$ISL" == "1024" ]]; then - SWA_FULL_TOKENS_RATIO=0.5 -else - SWA_FULL_TOKENS_RATIO=0.1 -fi - -# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm -# script's pattern). DP-attention runs the empirically-tuned high-concurrency -# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer); -# single-instance uses flashinfer_mxfp4 with the cookbook defaults. -DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - -# Default; the DP-attn branch below overrides to 0.94. -MEM_FRACTION_STATIC=0.90 - -if [ "${DP_ATTENTION}" = "true" ]; then - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) - MEM_FRACTION_STATIC=0.94 -else - PARALLEL_ARGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 8192 - --disable-flashinfer-autotune - ) -fi - -# Print all SGLANG_* env vars to both the CI step log and server.log so the -# launch config is auditable from the result artifact alone. -{ - echo "=== SGLANG_* env vars at launch ===" - env | grep -E '^SGLANG_' | sort - echo "===================================" -} | tee "$SERVER_LOG" +# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the +# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300 +# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from +# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant). +# Restore the CONC-based low-latency / balanced / max-throughput dispatch +# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under +# --moe-a2a-backend deepep. +RECIPE=low-latency +RECIPE_FLAGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 4096 + --disable-flashinfer-autotune + --mem-fraction-static 0.82 +) +echo "Recipe: $RECIPE (CONC=$CONC)" set -x PYTHONNOUSERSITE=1 sglang serve \ @@ -111,10 +73,8 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ - --mem-fraction-static "$MEM_FRACTION_STATIC" \ - --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ - "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & + --disable-radix-cache \ + "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 300d39c40..89f0a7aea 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1779,13 +1779,6 @@ - "Prefix caching and speculative decoding disabled for baseline numbers" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 - - config-keys: - dsv4-fp8-mi355x-sglang description: @@ -1856,26 +1849,6 @@ - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170 -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "better performance for dp-attention" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174 - - config-keys: - dsv4-fp4-b300-sglang-mtp description: @@ -1888,13 +1861,6 @@ - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "better performance for dp-attention" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178 - - config-keys: - dsv4-fp4-b300-vllm description: