diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index dededd071..ac552c733 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -70,6 +70,7 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}
 MEM_FRACTION_STATIC=0.90
 
 if [ "${DP_ATTENTION}" = "true" ]; then
+    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
     export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
     export SGLANG_OPT_USE_FAST_MASK_EP=1
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 77c2dd31e..589f75766 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1868,3 +1868,10 @@
   description:
   - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173
+
+- config-keys:
+  - dsv4-fp4-b300-sglang
+  description:
+  - "better performance for dp-attention"
+  - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174