diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 39e299cb0..19398bccc 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1860,7 +1860,7 @@ dsr1-fp8-b300-sglang: # until a B300-specific recipe ships. Prefix caching is disabled. # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:2922230d92982cec72f4ead04fb1da2af5301bef48f223a822fa4cf9696b9fcd model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -1879,15 +1879,19 @@ dsv4-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # --- only testing conc 8192 for now --- + # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=8) + - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } + # --- 8k1k temporarily disabled for focused 1k1k testing --- + # - isl: 8192 + # osl: 1024 + # search-space: + # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. 
Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index ac552c733..f926ac732 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -71,23 +71,47 @@ MEM_FRACTION_STATIC=0.90 if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 export SGLANG_OPT_USE_FAST_MASK_EP=1 export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) - MEM_FRACTION_STATIC=0.94 + if [ "$CONC" = "8192" ]; then + # 1k1k high-concurrency mega_moe deepep recipe + export NVSHMEM_DISABLE_IB=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_LOG_FORWARD_ITERS=1 + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8224 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --cuda-graph-max-bs 1056 + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 65536 + --tokenizer-worker-num 16 + --enable-prefill-delayer + --decode-log-interval 5 + ) + MAX_RUNNING_REQUESTS=8224 + MEM_FRACTION_STATIC=0.8 + SWA_FULL_TOKENS_RATIO=0.3 + else + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-runner-backend 
flashinfer_mxfp4 + --disable-flashinfer-autotune + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 16384 + --enable-prefill-delayer + ) + MEM_FRACTION_STATIC=0.94 + fi else PARALLEL_ARGS=( --moe-runner-backend flashinfer_mxfp4 @@ -111,7 +135,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ + --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0421c5596..6d3ae0f2a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1928,3 +1928,10 @@ - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k" - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 + + - config-keys: + - dsv4-fp4-b300-sglang + description: + - "1k1k conc=8192: mega_moe deepep backend with cuda-graph-max-bs 1056, max-running-requests 8224, mem 0.8, swa-ratio 0.3, tokenizer-workers 16" + - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 entries" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1207