17 changes: 10 additions & 7 deletions benchmarks/single_node/dsv4_fp4_mi355x_atom.sh
@@ -205,20 +205,23 @@ set -x

 BLOCK_SIZE=${BLOCK_SIZE:-16}
 # --enforce-eager is required: ROCm/ATOM#650 (PR1 skeleton) has no CUDAGraph
-# support yet (deferred to a follow-up PR). max-num-seqs uses the ATOM
-# default (512) — matches every other ATOM benchmark script in the repo.
-# The PR1 kv_cache[:1,...] hardcode in deepseek_v4.py means any forward
-# with batch>1 silently corrupts non-slot-0 lanes; this risk activates
-# whenever the scheduler assembles batch>1, regardless of the explicit
-# max-num-seqs value, so pinning it to 4 (the PR's offline repro value)
-# offered no protective benefit. eval (gsm8k) at conc>1 is the canary.
+# support yet (deferred to a follow-up PR). max-num-seqs is sized to the
+# client concurrency with a floor at 4 — the ATOM default (512) makes the
+# KV/GDN-mamba allocator overshoot the GPU budget ("GDN mamba tensor
+# exceeds available KV budget"), and using 1 hangs warmup at 0% GPU. 4
+# is the minimum we've seen complete warmup successfully (also the PR's
+# offline repro value). The PR1 kv_cache[:1,...] hardcode in
+# deepseek_v4.py means any forward with batch>1 silently corrupts
+# non-slot-0 lanes; eval (gsm8k) at conc>1 is the canary.
+MAX_NUM_SEQS=$(( CONC < 4 ? 4 : CONC ))
 python3 -m atom.entrypoints.openai_server \
Comment on lines +215 to 217
Contributor

🟡 Nit: the EP-guard rationale block at lines 22-28 still claims "max-num-seqs=4 caps the running batch below the YAML's max conc (32)", but with this PR's MAX_NUM_SEQS=$(( CONC < 4 ? 4 : CONC )), CONC=32 yields max-num-seqs=32 — equal to (not below) the YAML max. Worth updating that comment in the same change so the two rationale blocks (lines 22-28 vs the new 207-216) tell the same story.

Extended reasoning...

What's stale: Lines 22-28 of benchmarks/single_node/dsv4_fp4_mi355x_atom.sh are unchanged by this PR and still read:

The CONC guard was relaxed to empirically probe whether kv_cache[:1,...] in deepseek_v4.py actually corrupts at batch>1 in the server path: max-num-seqs=4 caps the running batch below the YAML's max conc (32), and per-sequence eval correctness will tell us if the hardcode bites.

Why it's now wrong: the new line 216 introduces MAX_NUM_SEQS=$(( CONC < 4 ? 4 : CONC )). For the YAML's max CONC=32, this evaluates to MAX_NUM_SEQS=32 — the running batch ceiling now equals the YAML max conc; it no longer caps below it. The per-CONC table in the PR description confirms this (CONC=32 → MAX_NUM_SEQS=32). So the load-bearing premise of the EP/CONC-guard rationale comment ("max-num-seqs=4 caps the running batch below 32") no longer holds.

Step-by-step proof:

  1. YAML max conc per the lines 22-28 comment is 32.
  2. After this PR, the script computes MAX_NUM_SEQS=$(( 32 < 4 ? 4 : 32 )) when CONC=32.
  3. $(( 32 < 4 ? 4 : 32 )) = 32.
  4. The server is launched with --max-num-seqs 32, so the scheduler can assemble batches up to 32.
  5. The lines 22-28 comment still says max-num-seqs=4 caps the batch below 32. That sentence is now factually false at CONC=32.
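Step 3 can be sanity-checked directly in bash. This is an illustrative loop, not part of the PR; the CONC values are chosen to cover the floor and the YAML max:

```sh
# Evaluate the new sizing at the boundary values; bash arithmetic
# supports the ternary operator the script uses.
for CONC in 1 2 4 8 32; do
  echo "CONC=$CONC -> MAX_NUM_SEQS=$(( CONC < 4 ? 4 : CONC ))"
done
# Prints 4, 4, 4, 8, 32: the floor only binds below 4; at CONC=32 the
# cap equals the YAML max conc, which is the point of this nit.
```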

Why the new lines 207-215 don't make this self-correcting: the new comment block describes what the new sizing does ("sized to the client concurrency with a floor at 4") but doesn't touch the EP-guard rationale. A future reader who looks at the EP guard at lines 22-30 to understand why only EP_SIZE != 1 exits fatally (and why the symmetric CONC guard was dropped) will read the now-false claim that "max-num-seqs=4 caps the running batch below the YAML's max conc (32)" as the justification.

Addressing the refutation: The refuter argues this is comment drift that doesn't lead a reader to take an incorrect action, citing the nit threshold. That's why this is filed as nit, not normal — agreed it doesn't affect runtime. But the lines 22-28 block is the only place that documents the EP-guard reasoning in this script, and the false premise materially changes the argument: the original rationale was "we kept the CONC guard's safety net via the max-num-seqs cap"; with this PR there is no such cap at CONC=32. Two out of two co-located comment blocks now disagree with each other in the same file, and the fix is a one-line edit. Worth folding into this PR.

How to fix: update the sentence in lines 22-28 to match the new sizing, e.g. "max-num-seqs is sized to CONC (floored at 4), so the running batch can reach the YAML's max conc — gsm8k accuracy at conc>1 vs conc=1 is the canary for the kv_cache[:1,...] hardcode biting." Or simply delete the parenthetical about the cap and point to the lines 207-215 block.
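Rendered as a diff against the quoted sentence (the line wrapping inside the script comment is assumed; the before-text is the quote above and the after-text is the suggested wording):

```diff
-# ... max-num-seqs=4 caps the running batch below the YAML's max conc (32),
-# and per-sequence eval correctness will tell us if the hardcode bites.
+# ... max-num-seqs is sized to CONC (floored at 4), so the running batch can
+# reach the YAML's max conc — gsm8k accuracy at conc>1 vs conc=1 is the
+# canary for the kv_cache[:1,...] hardcode biting.
```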

 --model $MODEL \
 --server-port $PORT \
 -tp $TP \
 --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \
 --block-size $BLOCK_SIZE \
 --enforce-eager \
+--max-num-seqs $MAX_NUM_SEQS \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
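Not part of the diff: once the server backgrounds, a readiness probe plus a one-shot request makes a reasonable smoke test. A sketch assuming the usual OpenAI-compatible routes (/v1/models, /v1/completions), which the entrypoint name suggests but the diff does not show:

```sh
# Wait for the OpenAI-compatible server to come up, then fire one request.
# Route names are assumptions; $PORT and $MODEL come from the script above.
until curl -sf "http://localhost:${PORT}/v1/models" > /dev/null; do sleep 5; done
curl -s "http://localhost:${PORT}/v1/completions" \
  -H 'Content-Type: application/json' \
  -d "{\"model\": \"${MODEL}\", \"prompt\": \"2+2=\", \"max_tokens\": 8}"
```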
11 changes: 11 additions & 0 deletions perf-changelog.yaml
@@ -1844,3 +1844,14 @@
- "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)"
- "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1165

- config-keys:
- dsv4-fp4-mi355x-atom
description:
- "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)"
- "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . from a pinned PR SHA (cdbff35) inside the benchmark script"
- "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton"
- "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us"
- "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)"
- "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170
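For reference, a minimal sketch of the runtime overlay this entry describes. The pinned SHAs, the pip install --no-deps -e . flags, and the python/triton_kernels source layout come from the entry itself; the clone locations, repo URLs, and the pull/650/head fetch are assumptions:

```sh
# Overlay ROCm/ATOM PR #650 at the pinned SHA onto the release image.
git clone https://github.com/ROCm/atom.git /tmp/atom
git -C /tmp/atom fetch origin pull/650/head   # PR head ref, GitHub convention
git -C /tmp/atom checkout cdbff35             # pinned PR SHA from the entry
pip install --no-deps -e /tmp/atom

# triton_kernels fallback: the release image cleans up its build-stage copy,
# so install from ROCm/triton at the RI3.5.x commit, which still carries the
# matmul_ogs.py and routing.py that PR #650 imports.
git clone https://github.com/ROCm/triton.git /tmp/triton
git -C /tmp/triton checkout e491726
pip install --no-deps -e /tmp/triton/python/triton_kernels
```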