From 1730de5e0160e1d6b0c7ed23740574f983679acf Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Fri, 1 May 2026 15:11:05 +0000
Subject: [PATCH 1/2] [AMD] dsv4-fp8-mi355x-sglang

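- bump image to rocm/sgl-dev:rocm720-mi35x-c924543-20260430-DSv4
  (c924543 daily build)
- enable TileLang attention and indexer: switch
  SGLANG_HACK_FLASHMLA_BACKEND from torch to tilelang and set
  SGLANG_OPT_USE_TILELANG_INDEXER=true
- enable CUDA graph by dropping the --disable-cuda-graph launch flag
- export SGLANG_REASONING_EFFORT=max
- add the matching perf-changelog.yaml entry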
---
 .github/configs/amd-master.yaml           |  2 +-
 benchmarks/single_node/dsv4_fp8_mi355x.sh |  6 +++---
 perf-changelog.yaml                       | 12 ++++++++++++
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 893210ef6..63b9236bf 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1519,7 +1519,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=1"
 
 dsv4-fp8-mi355x-sglang:
-  image: rocm/sgl-dev:deepseek-v4-mi35x
+  image: rocm/sgl-dev:rocm720-mi35x-c924543-20260430-DSv4
   model: sgl-project/DeepSeek-V4-Pro-FP8
   model-prefix: dsv4
   runner: mi355x
diff --git a/benchmarks/single_node/dsv4_fp8_mi355x.sh b/benchmarks/single_node/dsv4_fp8_mi355x.sh
index 971b18b6a..8fe26d778 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x.sh
@@ -39,13 +39,14 @@ else:
     print(f"No patch needed: model_type is {config.get('model_type')!r}")
 PYEOF
 
-# DSv4-specific SGLang env vars (from sgl-project/sglang#23608)
+export SGLANG_REASONING_EFFORT=max
 export SGLANG_OPT_USE_FUSED_COMPRESS=false
 export SGLANG_OPT_USE_OLD_COMPRESSOR=true
 export SGLANG_OPT_USE_TILELANG_SWA_PREPARE=false
 export SGLANG_OPT_USE_JIT_KERNEL_FUSED_TOPK=false
 export SGLANG_OPT_USE_FUSED_HASH_TOPK=false
-export SGLANG_HACK_FLASHMLA_BACKEND=torch
+export SGLANG_HACK_FLASHMLA_BACKEND=tilelang
+export SGLANG_OPT_USE_TILELANG_INDEXER=true
 export SGLANG_OPT_DEEPGEMM_HC_PRENORM=false
 export SGLANG_OPT_USE_TILELANG_MHC_PRE=false
 export SGLANG_OPT_USE_TILELANG_MHC_POST=false
@@ -85,7 +86,6 @@ python3 -m sglang.launch_server \
     --page-size 256 \
     --chunked-prefill-size 8192 \
     --disable-shared-experts-fusion \
-    --disable-cuda-graph \
     --tool-call-parser deepseekv4 \
     --reasoning-parser deepseek-v4 \
     --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 0403c2385..211c346db 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2069,3 +2069,15 @@
     - "Recipes cover 8k/1k aggregate TP8 low-latency conc=1, low-latency bridge 1P DEP8 + 4D TP8 no-offload conc=16/32/64, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024"
     - "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242
+
+- config-keys:
+    - dsv4-fp8-mi355x-sglang
+  description:
+    - "Bump dsv4-fp8-mi355x-sglang image to rocm/sgl-dev:rocm720-mi35x-c924543-20260430-DSv4 (sgl-project/sglang amd/deepseek_v4 integration through 6/N + ENV-set commit c924543)"
+    - "Switch SGLANG_HACK_FLASHMLA_BACKEND from torch to tilelang (sgl-project/sglang#24033, FlashMLA 101->2 kernels per call)"
+    - "Add SGLANG_OPT_USE_TILELANG_INDEXER=true (sgl-project/sglang#24050, fp8 paged-MQA-logits indexer 12->1 kernels per call)"
+    - "Drop --disable-cuda-graph from sglang.launch_server (CUDA graph for DSv4 on ROCm/HIP enabled by sgl-project/sglang#23832)"
+    - "Keep SGLANG_TOPK_TRANSFORM_512_TORCH=1 for now: sgl-project/sglang#24143 (topk512 native ROCm kernel) merged 4-30 21:31 UTC, after the c924543 image was built (4-30 08:26 UTC); will flip to 0 once a newer daily image lands"
+    - "Keep SGLANG_DSV4_FP4_EXPERTS=false and SGLANG_FORCE_TRITON_MOE_FP8=1: required for sgl-project/DeepSeek-V4-Pro-FP8 (FP4 path asserts intermediate_size_per_partition==2048 in fp8.py; swiglu_limit clamp lives in fused_moe_triton)"
+    - "Expected speedup over the previous PR #23608 day-0 torch-fallback recipe: ~5.4-5.8x at conc 1-8 (matches the '+ indexer tilelang attn' tier in the AMD DSv4-Flash-FP8 reference table)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/Placeholder

From d6dd2f7a6d87da521fd9f2c67dd20352a70833f4 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Fri, 1 May 2026 15:14:49 +0000
Subject: [PATCH 2/2] Set perf-changelog pr-link to PR #1255

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 211c346db..53b0b0ae9 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2080,4 +2080,4 @@
     - "Keep SGLANG_TOPK_TRANSFORM_512_TORCH=1 for now: sgl-project/sglang#24143 (topk512 native ROCm kernel) merged 4-30 21:31 UTC, after the c924543 image was built (4-30 08:26 UTC); will flip to 0 once a newer daily image lands"
     - "Keep SGLANG_DSV4_FP4_EXPERTS=false and SGLANG_FORCE_TRITON_MOE_FP8=1: required for sgl-project/DeepSeek-V4-Pro-FP8 (FP4 path asserts intermediate_size_per_partition==2048 in fp8.py; swiglu_limit clamp lives in fused_moe_triton)"
     - "Expected speedup over the previous PR #23608 day-0 torch-fallback recipe: ~5.4-5.8x at conc 1-8 (matches the '+ indexer tilelang attn' tier in the AMD DSv4-Flash-FP8 reference table)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/Placeholder
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1255