diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3a7ba3df1..616aa6ff9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2501,17 +2501,15 @@ dsv4-fp4-b300-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 4 } - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 128, conc-end: 128 } - - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 4096 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 8192 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 4 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 128, conc-end: 128 } - - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 1024 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 4096 } qwen3.5-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.9-cu129-amd64 diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh index 52f38c4d9..6bb5b9049 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh @@ -1,9 +1,8 @@ #!/usr/bin/env bash # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 -# pareto sweep. The matrix uses dp-attn=true as the existing switch to flip a -# 4-GPU run from TP4 to DP4. Expert parallel is always enabled to match the -# provided vllm serve command exactly. +# pareto sweep. TP mode (dp-attn=false) runs without expert parallel; DP mode +# (dp-attn=true) enables expert parallel (EP_SIZE=TP value = DP size). 
source "$(dirname "$0")/../benchmark_lib.sh" @@ -38,6 +37,17 @@ if [ "${DP_ATTENTION}" = "true" ]; then PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") fi +EP_ARGS=() +if [ "${EP_SIZE:-1}" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + +if [ "${DP_ATTENTION}" = "true" ]; then + MAX_NUM_BATCHED_TOKENS=2048 +else + MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 )) +fi + BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN" if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then BENCHMARK_MAX_MODEL_LEN=4096 @@ -62,7 +72,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ --trust-remote-code \ --block-size 256 \ --no-enable-prefix-caching \ - --enable-expert-parallel \ + "${EP_ARGS[@]}" \ --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \ --attention_config.use_fp4_indexer_cache True \ --tokenizer-mode deepseek_v4 \ @@ -71,7 +81,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ --reasoning-parser deepseek_v4 \ --max-cudagraph-capture-size 2048 \ --max-model-len "$SERVE_MAX_MODEL_LEN" \ - --max-num-batched-tokens 2048 > "$SERVER_LOG" 2>&1 & + --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e4c46268e..300d39c40 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1894,3 +1894,12 @@ - "better performance for dp-attention" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - "Update search space based on B300 pareto sweep results" + - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192" + - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-4096" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155 +