diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5bc61f53a..eaf504f63 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1569,7 +1569,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
     - "DECODE_MTP_SIZE=1"
 
 dsv4-fp8-mi355x-sglang:
-  image: rocm/sgl-dev:deepseek-v4-mi35x
+  image: rocm/sgl-dev:rocm720-mi35x-c924543-20260430-DSv4
   model: sgl-project/DeepSeek-V4-Pro-FP8
   model-prefix: dsv4
   runner: mi355x
diff --git a/benchmarks/single_node/dsv4_fp8_mi355x.sh b/benchmarks/single_node/dsv4_fp8_mi355x.sh
index 971b18b6a..8fe26d778 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x.sh
@@ -39,13 +39,14 @@
 else:
     print(f"No patch needed: model_type is {config.get('model_type')!r}")
 PYEOF
 # DSv4-specific SGLang env vars (from sgl-project/sglang#23608)
 export SGLANG_OPT_USE_FUSED_COMPRESS=false
 export SGLANG_OPT_USE_OLD_COMPRESSOR=true
 export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
 export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
 export SGLANG_OPT_USE_FUSED_HASH_TOPK=false
-export SGLANG_HACK_FLASHMLA_BACKEND=torch
+export SGLANG_HACK_FLASHMLA_BACKEND=tilelang
+export SGLANG_OPT_USE_TILELANG_INDEXER=true
 export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
 export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
 export SGLANG_OPT_USE_TILELANG_MHC_POST=false
@@ -85,7 +86,6 @@ python3 -m sglang.launch_server \
     --page-size 256 \
     --chunked-prefill-size 8192 \
     --disable-shared-experts-fusion \
-    --disable-cuda-graph \
     --tool-call-parser deepseekv4 \
     --reasoning-parser deepseek-v4 \
     --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 647ec35f9..a959aff1b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2070,6 +2070,18 @@
   - "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242
 
+- config-keys:
+  - dsv4-fp8-mi355x-sglang
+  description:
+  - "Bump dsv4-fp8-mi355x-sglang image to rocm/sgl-dev:rocm720-mi35x-c924543-20260430-DSv4 (sgl-project/sglang amd/deepseek_v4 integration through 6/N + ENV-set commit c924543)"
+  - "Switch SGLANG_HACK_FLASHMLA_BACKEND from torch to tilelang (sgl-project/sglang#24033, FlashMLA 101->2 kernels per call)"
+  - "Add SGLANG_OPT_USE_TILELANG_INDEXER=true (sgl-project/sglang#24050, fp8 paged-MQA-logits indexer 12->1 kernels per call)"
+  - "Drop --disable-cuda-graph from sglang.launch_server (CUDA graph for DSv4 on ROCm/HIP enabled by sgl-project/sglang#23832)"
+  - "Keep SGLANG_TOPK_TRANSFORM_512_TORCH=1 for now: sgl-project/sglang#24143 (topk512 native ROCm kernel) merged 4-30 21:31 UTC, after the c924543 image was built (4-30 08:26 UTC); will flip to 0 once a newer daily image lands"
+  - "Keep SGLANG_DSV4_FP4_EXPERTS=false and SGLANG_FORCE_TRITON_MOE_FP8=1: required for sgl-project/DeepSeek-V4-Pro-FP8 (FP4 path asserts intermediate_size_per_partition==2048 in fp8.py; swiglu_limit clamp lives in fused_moe_triton)"
+  - "Expected speedup over the previous PR #23608 day-0 torch-fallback recipe: ~5.4-5.8x at conc 1-8 (matches the '+ indexer tilelang attn' tier in the AMD DSv4-Flash-FP8 reference table)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1255
+
 - config-keys:
   - glm5-fp8-mi355x-sglang-mtp
   description:
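
Note on the second hunk: the else/print/PYEOF context lines are the tail of a
Python heredoc in dsv4_fp8_mi355x.sh that patches the checkpoint's config.json
before launching the server. A minimal sketch of that kind of patch follows;
only the final else branch is visible in the diff, so the MODEL_DIR variable,
the config path, and the deepseek_v4 target value are illustrative assumptions,
not taken from the script.

    import json
    import os

    # Hypothetical checkpoint location; the real script's path handling is
    # outside this hunk's context.
    config_path = os.path.join(os.environ["MODEL_DIR"], "config.json")

    with open(config_path) as f:
        config = json.load(f)

    # Rewrite model_type so the serving stack routes the checkpoint to the
    # intended architecture; skip if it is already set (assumed target value).
    if config.get("model_type") != "deepseek_v4":
        config["model_type"] = "deepseek_v4"
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)
        print(f"Patched model_type to {config['model_type']!r}")
    else:
        print(f"No patch needed: model_type is {config.get('model_type')!r}")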