From 3a2e45962ae0f8d8eb6acd3a93186b68930797a6 Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sat, 25 Apr 2026 15:55:14 -0700 Subject: [PATCH 01/11] fix sgl b200/b300 script --- benchmarks/single_node/dsv4_fp4_b200.sh | 54 ++++++++++++++--- .../single_node/dsv4_fp4_b300_sglang.sh | 59 ++++++++++++++----- 2 files changed, 90 insertions(+), 23 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index d455af3a3..58d22f890 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -49,35 +49,71 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96} if [[ $CONC -le 32 ]]; then RECIPE=low-latency + export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 + # common optimizations + export SGLANG_OPT_USE_JIT_NORM=1 + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 + export SGLANG_OPT_USE_TOPK_V2=1 + export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 RECIPE_FLAGS=( --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 4096 + --chunked-prefill-size 8192 --disable-flashinfer-autotune - --mem-fraction-static 0.82 + --mem-fraction-static 0.90 + --max-running-requests 32 + --swa-full-tokens-ratio 0.1 ) elif [[ $CONC -le 128 ]]; then RECIPE=balanced - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 + # common optimizations + export SGLANG_OPT_USE_JIT_NORM=1 + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 + export SGLANG_OPT_USE_TOPK_V2=1 + export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + # MoE EP related flags + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 RECIPE_FLAGS=( --dp-size "$TP" --enable-dp-attention --moe-a2a-backend deepep 
--deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 + --mem-fraction-static 0.83 --max-running-requests 128 + --chunked-prefill-size 32768 + --swa-full-tokens-ratio 0.1 ) else RECIPE=max-throughput - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 + # common optimizations + export SGLANG_OPT_USE_JIT_NORM=1 + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 + export SGLANG_OPT_USE_TOPK_V2=1 + export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + # MoE EP related flags + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 RECIPE_FLAGS=( --dp-size "$TP" --enable-dp-attention --moe-a2a-backend deepep --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - --max-running-requests 256 + --mem-fraction-static 0.90 + --max-running-requests 512 + --chunked-prefill-size 32768 + --swa-full-tokens-ratio 0.1 ) fi echo "Recipe: $RECIPE (CONC=$CONC)" diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index faa946174..313de980c 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -25,11 +25,6 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its -# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half -# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks. -unset CUDA_VISIBLE_DEVICES - # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. 
# The runner mounts our repo at a non-/workspace path for these images so the @@ -59,35 +54,71 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96} if [[ $CONC -le 32 ]]; then RECIPE=low-latency + export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 + # common optimizations + export SGLANG_OPT_USE_JIT_NORM=1 + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 + export SGLANG_OPT_USE_TOPK_V2=1 + export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 RECIPE_FLAGS=( --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 4096 + --chunked-prefill-size 8192 --disable-flashinfer-autotune - --mem-fraction-static 0.82 + --mem-fraction-static 0.90 + --max-running-requests 32 + --swa-full-tokens-ratio 0.1 ) elif [[ $CONC -le 128 ]]; then RECIPE=balanced - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 + # common optimizations + export SGLANG_OPT_USE_JIT_NORM=1 + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 + export SGLANG_OPT_USE_TOPK_V2=1 + export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + # MoE EP related flags + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 RECIPE_FLAGS=( --dp-size "$TP" --enable-dp-attention --moe-a2a-backend deepep --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 + --mem-fraction-static 0.83 --max-running-requests 128 + --chunked-prefill-size 32768 + --swa-full-tokens-ratio 0.1 ) else RECIPE=max-throughput - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 + # common optimizations + export SGLANG_OPT_USE_JIT_NORM=1 + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 + export SGLANG_OPT_USE_TOPK_V2=1 + export 
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + # MoE EP related flags + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 RECIPE_FLAGS=( --dp-size "$TP" --enable-dp-attention --moe-a2a-backend deepep --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - --max-running-requests 256 + --mem-fraction-static 0.90 + --max-running-requests 512 + --chunked-prefill-size 32768 + --swa-full-tokens-ratio 0.1 ) fi echo "Recipe: $RECIPE (CONC=$CONC)" From 3141900d7bbe9edb7ae9ca55f283a6f22b7dab45 Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sat, 25 Apr 2026 16:12:46 -0700 Subject: [PATCH 02/11] pin docker image --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 42c720a63..3e12cc1fd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1670,7 +1670,7 @@ dsr1-fp4-b200-sglang: - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } dsv4-fp4-b200-sglang: - image: lmsysorg/sglang:deepseek-v4-blackwell + image: lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -1838,7 +1838,7 @@ dsr1-fp8-b300-sglang: # until a B300-specific recipe ships. Prefix caching is disabled. # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. 
dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From ef4199594da7903a9114fcc84206e15e1ebcbc70 Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sat, 25 Apr 2026 17:50:00 -0700 Subject: [PATCH 03/11] fix --- .github/configs/nvidia-master.yaml | 24 ++++++++++++++----- benchmarks/single_node/dsv4_fp4_b200.sh | 1 - .../single_node/dsv4_fp4_b300_sglang.sh | 1 - 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3e12cc1fd..932a8e042 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1857,21 +1857,33 @@ dsv4-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - # low-latency + # low-latency (tp=8 single-instance) - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - # balanced + # balanced (tp=8 DP-attn) - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput + # max-throughput (tp=8 DP-attn) - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + # low-latency (tp=4 single-instance, mirrors b300 vllm pareto sweep) + - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } + # balanced (tp=4 DP-attn -> DP4 on 4 GPUs) + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128 } + # max-throughput (tp=4 DP-attn -> DP4 on 4 GPUs) + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - # low-latency + # low-latency (tp=8 single-instance) - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - # balanced + # balanced (tp=8 DP-attn) - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput + # max-throughput (tp=8 DP-attn) - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } + # low-latency (tp=4 single-instance) + - { tp: 4, ep: 1, 
conc-start: 4, conc-end: 32 } + # balanced (tp=4 DP-attn -> DP4 on 4 GPUs) + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128 } + # max-throughput (tp=4 DP-attn -> DP4 on 4 GPUs) + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 58d22f890..7d81ffc3e 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -125,7 +125,6 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --disable-radix-cache \ "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 313de980c..0cced8a1f 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -130,7 +130,6 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --disable-radix-cache \ "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
From 7d107e8f13c833e380acf1b7dc035935cbd0bb30 Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sat, 25 Apr 2026 18:32:54 -0700 Subject: [PATCH 04/11] fix --- .github/configs/nvidia-master.yaml | 26 +----- benchmarks/single_node/dsv4_fp4_b200.sh | 90 ++++++------------- .../single_node/dsv4_fp4_b300_sglang.sh | 90 ++++++------------- 3 files changed, 62 insertions(+), 144 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 932a8e042..802b1566a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1857,33 +1857,15 @@ dsv4-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - # low-latency (tp=8 single-instance) - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - # balanced (tp=8 DP-attn) - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput (tp=8 DP-attn) - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - # low-latency (tp=4 single-instance, mirrors b300 vllm pareto sweep) - - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } - # balanced (tp=4 DP-attn -> DP4 on 4 GPUs) - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput (tp=4 DP-attn -> DP4 on 4 GPUs) - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - # low-latency (tp=8 single-instance) - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - # balanced (tp=8 DP-attn) - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput (tp=8 DP-attn) - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } - # low-latency (tp=4 single-instance) - - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } - # balanced (tp=4 DP-attn -> DP4 on 4 GPUs) - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput (tp=4 DP-attn -> DP4 on 4 GPUs) - - 
{ tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 7d81ffc3e..e7a676b45 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -19,7 +20,13 @@ hf download "$MODEL" nvidia-smi +# Common SGLANG env vars (apply to every config). export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -30,7 +37,7 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -40,38 +47,12 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): -# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune -# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 -# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 +# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm +# script's pattern). 
DP-attention turns on EP-MoE (deepep) and the related +# mega_moe optimizations; single-instance uses flashinfer_mxfp4. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -if [[ $CONC -le 32 ]]; then - RECIPE=low-latency - export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 - # common optimizations - export SGLANG_OPT_USE_JIT_NORM=1 - export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 - export SGLANG_OPT_USE_TOPK_V2=1 - export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 8192 - --disable-flashinfer-autotune - --mem-fraction-static 0.90 - --max-running-requests 32 - --swa-full-tokens-ratio 0.1 - ) -elif [[ $CONC -le 128 ]]; then - RECIPE=balanced - export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 - # common optimizations - export SGLANG_OPT_USE_JIT_NORM=1 - export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 - export SGLANG_OPT_USE_TOPK_V2=1 - export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - # MoE EP related flags +if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 export SGLANG_OPT_USE_FAST_MASK_EP=1 @@ -79,44 +60,28 @@ elif [[ $CONC -le 128 ]]; then export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - RECIPE_FLAGS=( + PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention --moe-a2a-backend deepep --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.83 - --max-running-requests 128 --chunked-prefill-size 32768 - --swa-full-tokens-ratio 0.1 ) else - RECIPE=max-throughput - export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 - # common optimizations - export SGLANG_OPT_USE_JIT_NORM=1 - export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 - export SGLANG_OPT_USE_TOPK_V2=1 - export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - # MoE EP related flags - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 
- export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.90 - --max-running-requests 512 - --chunked-prefill-size 32768 - --swa-full-tokens-ratio 0.1 + PARALLEL_ARGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 8192 + --disable-flashinfer-autotune ) fi -echo "Recipe: $RECIPE (CONC=$CONC)" + +# Print all SGLANG_* env vars to both the CI step log and server.log so the +# launch config is auditable from the result artifact alone. +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" set -x PYTHONNOUSERSITE=1 sglang serve \ @@ -125,7 +90,10 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --max-running-requests "$((CONC * 3 / 2))" \ + --mem-fraction-static 0.90 \ + --swa-full-tokens-ratio 0.1 \ + "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 0cced8a1f..0e32c0260 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ MODEL \ TP \ + DP_ATTENTION \ CONC \ ISL \ OSL \ @@ -23,7 +24,13 @@ fi nvidia-smi +# Common SGLANG env vars (apply to every config). 
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -35,7 +42,7 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -45,38 +52,12 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): -# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune -# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 -# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 +# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm +# script's pattern). DP-attention turns on EP-MoE (deepep) and the related +# mega_moe optimizations; single-instance uses flashinfer_mxfp4. 
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -if [[ $CONC -le 32 ]]; then - RECIPE=low-latency - export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 - # common optimizations - export SGLANG_OPT_USE_JIT_NORM=1 - export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 - export SGLANG_OPT_USE_TOPK_V2=1 - export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 8192 - --disable-flashinfer-autotune - --mem-fraction-static 0.90 - --max-running-requests 32 - --swa-full-tokens-ratio 0.1 - ) -elif [[ $CONC -le 128 ]]; then - RECIPE=balanced - export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 - # common optimizations - export SGLANG_OPT_USE_JIT_NORM=1 - export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 - export SGLANG_OPT_USE_TOPK_V2=1 - export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - # MoE EP related flags +if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 export SGLANG_OPT_USE_FAST_MASK_EP=1 @@ -84,44 +65,28 @@ elif [[ $CONC -le 128 ]]; then export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - RECIPE_FLAGS=( + PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention --moe-a2a-backend deepep --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.83 - --max-running-requests 128 --chunked-prefill-size 32768 - --swa-full-tokens-ratio 0.1 ) else - RECIPE=max-throughput - export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 - # common optimizations - export SGLANG_OPT_USE_JIT_NORM=1 - export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 - export SGLANG_OPT_USE_TOPK_V2=1 - export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - # MoE EP related flags - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export 
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.90 - --max-running-requests 512 - --chunked-prefill-size 32768 - --swa-full-tokens-ratio 0.1 + PARALLEL_ARGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 8192 + --disable-flashinfer-autotune ) fi -echo "Recipe: $RECIPE (CONC=$CONC)" + +# Print all SGLANG_* env vars to both the CI step log and server.log so the +# launch config is auditable from the result artifact alone. +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" set -x PYTHONNOUSERSITE=1 sglang serve \ @@ -130,7 +95,10 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --max-running-requests "$((CONC * 3 / 2))" \ + --mem-fraction-static 0.90 \ + --swa-full-tokens-ratio 0.1 \ + "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! 
From 485cb8ef1f57f3ee27ec19f79637ab40e3f3b650 Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sat, 25 Apr 2026 18:39:17 -0700 Subject: [PATCH 05/11] fix --- .github/configs/nvidia-master.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 802b1566a..e9ea81eaf 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1690,7 +1690,7 @@ dsv4-fp4-b200-sglang: osl: 1024 search-space: # low-latency - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } # balanced - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } # max-throughput @@ -1699,7 +1699,7 @@ dsv4-fp4-b200-sglang: osl: 1024 search-space: # low-latency - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } # balanced - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } # max-throughput @@ -1857,14 +1857,14 @@ dsv4-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + - { tp: 4, ep: 1, conc-start: 1, conc-end: 64 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + - { tp: 4, ep: 1, conc-start: 1, conc-end: 64 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512 } qwen3.5-bf16-b200-sglang: From 7a4d41550ae8f909856ab5bbad99f8c5adee6b63 Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sat, 25 Apr 2026 21:38:13 -0700 Subject: [PATCH 06/11] tune swa-full-tokens-ratio --- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git 
a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 0e32c0260..89ab34e40 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -52,6 +52,14 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" +# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was +# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. +if [[ "$ISL" == "1024" ]]; then + SWA_FULL_TOKENS_RATIO=0.5 +else + SWA_FULL_TOKENS_RATIO=0.1 +fi + # Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm # script's pattern). DP-attention turns on EP-MoE (deepep) and the related # mega_moe optimizations; single-instance uses flashinfer_mxfp4. @@ -97,7 +105,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --tp $TP \ --max-running-requests "$((CONC * 3 / 2))" \ --mem-fraction-static 0.90 \ - --swa-full-tokens-ratio 0.1 \ + --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! From 0f975c895b70dc1e30b7eb5394834361acb72e54 Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sat, 25 Apr 2026 22:03:07 -0700 Subject: [PATCH 07/11] high concurrency --- .github/configs/nvidia-master.yaml | 2 +- .../single_node/dsv4_fp4_b300_sglang.sh | 31 +++++++++++++++++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e9ea81eaf..acc2b6ff2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1838,7 +1838,7 @@ dsr1-fp8-b300-sglang: # until a B300-specific recipe ships. Prefix caching is disabled. # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. 
dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 89ab34e40..fea14253c 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -65,7 +65,34 @@ fi # mega_moe optimizations; single-instance uses flashinfer_mxfp4. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -if [ "${DP_ATTENTION}" = "true" ]; then +# Default; the CONC=512 DP-attn branch below overrides to 0.94. +MEM_FRACTION_STATIC=0.90 + +if [[ "$CONC" == "512" ]] && [ "${DP_ATTENTION}" = "true" ]; then + # Empirically tuned recipe for the highest-concurrency DP-attn point. + # Note vs the standard DP-attn path: deepgemm + hash_mega_moe disabled, + # flashinfer_mxfp4 used as the runner backend, prefill chunks halved to + # 16384, prefill-delayer turned on, mem fraction bumped to 0.94. + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 16384 + --enable-prefill-delayer + ) + MEM_FRACTION_STATIC=0.94 + # Override the ISL=1024 → 0.5 default; this recipe runs SWA at 0.1. 
+ SWA_FULL_TOKENS_RATIO=0.1 +elif [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 export SGLANG_OPT_USE_FAST_MASK_EP=1 @@ -104,7 +131,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --trust-remote-code \ --tp $TP \ --max-running-requests "$((CONC * 3 / 2))" \ - --mem-fraction-static 0.90 \ + --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & From 68e40f8c680d8fac77e80ead5f11429aec2f85ee Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sat, 25 Apr 2026 22:21:15 -0700 Subject: [PATCH 08/11] change perflog --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7ed3c16ff..cab0d406e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1784,7 +1784,7 @@ description: - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 - config-keys: - dsv4-fp8-mi355x-sglang From e8e4810e83e09f0a77659b5ae227e5620b0d075f Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sun, 26 Apr 2026 00:59:17 -0700 Subject: [PATCH 09/11] upd --- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index fea14253c..3cca98645 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -90,8 +90,6 @@ if [[ "$CONC" == "512" ]] && [ "${DP_ATTENTION}" = "true" ]; then 
--enable-prefill-delayer ) MEM_FRACTION_STATIC=0.94 - # Override the ISL=1024 → 0.5 default; this recipe runs SWA at 0.1. - SWA_FULL_TOKENS_RATIO=0.1 elif [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 From cbde82daae9fd8e7f1c397501e43332f35d65522 Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sun, 26 Apr 2026 01:56:31 -0700 Subject: [PATCH 10/11] fix hang --- .../single_node/dsv4_fp4_b300_sglang.sh | 28 ++++--------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 3cca98645..6fae10837 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -61,18 +61,15 @@ else fi # Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm -# script's pattern). DP-attention turns on EP-MoE (deepep) and the related -# mega_moe optimizations; single-instance uses flashinfer_mxfp4. +# script's pattern). DP-attention runs the empirically-tuned high-concurrency +# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer); +# single-instance uses flashinfer_mxfp4 with the cookbook defaults. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -# Default; the CONC=512 DP-attn branch below overrides to 0.94. +# Default; the DP-attn branch below overrides to 0.94. MEM_FRACTION_STATIC=0.90 -if [[ "$CONC" == "512" ]] && [ "${DP_ATTENTION}" = "true" ]; then - # Empirically tuned recipe for the highest-concurrency DP-attn point. - # Note vs the standard DP-attn path: deepgemm + hash_mega_moe disabled, - # flashinfer_mxfp4 used as the runner backend, prefill chunks halved to - # 16384, prefill-delayer turned on, mem fraction bumped to 0.94. 
+if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 export SGLANG_OPT_USE_FAST_MASK_EP=1 @@ -90,21 +87,6 @@ if [[ "$CONC" == "512" ]] && [ "${DP_ATTENTION}" = "true" ]; then --enable-prefill-delayer ) MEM_FRACTION_STATIC=0.94 -elif [ "${DP_ATTENTION}" = "true" ]; then - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 32768 - ) else PARALLEL_ARGS=( --moe-runner-backend flashinfer_mxfp4 From 7273032d8f7ae505825d4bcb12fa6b9080998387 Mon Sep 17 00:00:00 2001 From: Qiaolin-Yu Date: Sun, 26 Apr 2026 02:10:10 -0700 Subject: [PATCH 11/11] remove useless points --- .github/configs/nvidia-master.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index acc2b6ff2..911cb503e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1857,15 +1857,15 @@ dsv4-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } - - { tp: 4, ep: 1, conc-start: 1, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } - - { tp: 4, ep: 1, conc-start: 1, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512 } + - { tp: 8, ep: 
1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e