31 changes: 31 additions & 0 deletions .github/configs/amd-master.yaml
@@ -1489,3 +1489,34 @@ dsv4-fp8-mi355x-sglang:
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
# PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...]
# hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE=1 required on
# gfx950. Image is the standard atom0.1.2.post MI355X base (matching
# qwen3.5-fp8-mi355x-atom); the DSv4 PR is overlaid at runtime by
# benchmarks/single_node/dsv4_fp4_mi355x_atom.sh at a pinned SHA. Sweep
# will expand once ATOM PR3 (multi-request) and PR4 (CUDAGraph) land.
dsv4-fp4-mi355x-atom:
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: mi355x
precision: fp4
framework: atom
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 8, ep: 1, conc-start: 16, conc-end: 16 }
- { tp: 8, ep: 1, conc-start: 32, conc-end: 32 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 8, ep: 1, conc-start: 16, conc-end: 16 }
- { tp: 8, ep: 1, conc-start: 32, conc-end: 32 }
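
For context, a minimal sketch of how a search-space row expands into jobs, assuming the conc-start..conc-end range is swept by doubling (inferred from the 4-to-64 range in the sglang entry above; the actual expansion lives in the workflow templates, so treat this as illustrative):

conc=4; while [ "$conc" -le 64 ]; do echo "tp=8 conc=$conc"; conc=$((conc * 2)); done
# -> tp=8 conc=4 / 8 / 16 / 32 / 64. The new dsv4-fp4 rows pin
# conc-start == conc-end, so each row is one explicit point
# (conc 1, 4, 16, 32 per isl/osl pair).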
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-tmpl.yml
@@ -98,7 +98,7 @@ permissions:
jobs:
benchmark:
runs-on: ${{ inputs.runner }}
- timeout-minutes: 300
+ timeout-minutes: 500
name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}"
steps:
- name: Resource cleanup (pre-run)
250 changes: 250 additions & 0 deletions benchmarks/single_node/dsv4_fp4_mi355x_atom.sh
@@ -0,0 +1,250 @@
#!/usr/bin/env bash
set -eo pipefail

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
MODEL \
TP \
CONC \
ISL \
OSL \
RANDOM_RANGE_RATIO \
RESULT_FILENAME \
EP_SIZE

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE"

# EP_SIZE > 1 is still unvalidated by PR #650's repro (offline TP=8 EP=1
# only). Keep the EP guard. The CONC guard was relaxed to empirically
# probe whether kv_cache[:1,...] in deepseek_v4.py actually corrupts at
# batch>1 in the server path: per-sequence eval correctness will tell us
# if the hardcode bites. (max-num-seqs stays at the ATOM default of 512;
# see the launch comment below.) If gsm8k accuracy collapses at conc>1,
# restore the CONC guard, kept commented out just below.
if [ "$EP_SIZE" -ne 1 ]; then
echo "FATAL: ROCm/ATOM#650 PR1 has not validated expert parallel serving; EP_SIZE must be 1, got $EP_SIZE" >&2
exit 1
fi
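
# For reference, the CONC guard described above, ready to restore verbatim if
# gsm8k accuracy collapses at conc>1 (a commented-out sketch, not active code):
# if [ "$CONC" -ne 1 ]; then
#   echo "FATAL: ROCm/ATOM#650 PR1 is single-sequence only; CONC must be 1, got $CONC" >&2
#   exit 1
# fi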

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

export OMP_NUM_THREADS=1

# DSv4-specific ATOM env vars (from ROCm/ATOM#650 repro command).
# The aiter fused_moe path is broken on gfx950 with a16w4+Swiglu, so PR1
# requires the triton matmul_ogs path. AITER_LOG_LEVEL quiets the noisy
# warmup logs that otherwise drown out the server-ready signal.
export ATOM_USE_TRITON_MOE=1
export AITER_LOG_LEVEL=WARNING

# Apply ROCm/ATOM#650 (DSv4 PR1 skeleton) over the image's wheel-installed
# atom. The chosen base image ships atom as a built wheel, not editable, so
# we overlay an editable install from the PR branch at a pinned SHA. Bump
# this SHA when the PR moves; do not track the branch tip (the run becomes
# a moving target if the branch is force-pushed).
ATOM_PR_SHA="cdbff359d3db7afd3801e28b38fc71253121ee84"
export ATOM_PR_DIR="/tmp/atom-pr650"

if [ ! -d "$ATOM_PR_DIR/.git" ]; then
git clone --filter=blob:none https://github.com/ROCm/ATOM.git "$ATOM_PR_DIR"
fi
(
cd "$ATOM_PR_DIR"
# Try a targeted fetch first (fast); fall back to fetching the PR ref if
# the server doesn't allow fetching the SHA directly.
git fetch --depth=1 origin "$ATOM_PR_SHA" 2>/dev/null \
|| git fetch --depth=1 origin pull/650/head
git checkout --force "$ATOM_PR_SHA"
test "$(git rev-parse HEAD)" = "$ATOM_PR_SHA"

# WORKAROUND: PR #650 has no env-var toggle to disable the aiter MHC
# kernels, and on this image aiter's `mhc_pre_big_fuse` crashes with a
# HIPGuardImplMasqueradingAsCUDA INTERNAL ASSERT the first time the
# model executes the hc_pre path during prefill (a HIP/CUDA device-type
# mismatch inside aiter, not something we can fix from outside). SGLang's
# DSv4 recipe disables the same family explicitly
# (SGLANG_OPT_USE_TILELANG_MHC_PRE/POST=false, _DEEPGEMM_HC_PRENORM=false).
# Force only `mhc_pre` to torch-fallback; leave `mhc_post` on the aiter
# path since the crash stack only implicated mhc_pre and we'd like to
# recover the perf of half the MHC pipeline. If mhc_post crashes too on
# the next run, add the second sed back.
sed -i 's|mhc_pre = getattr(_aiter, "mhc_pre", None)|mhc_pre = None # patched out (HIP device-guard crash)|' atom/models/deepseek_v4.py
grep -c "patched out" atom/models/deepseek_v4.py | grep -q '^1$' \
|| { echo "FATAL: mhc_pre sed patch did not apply"; exit 1; }

# --no-deps: don't churn the image's pinned ROCm/torch/triton/aiter.
# --force-reinstall: replace the wheel-installed atom with the editable copy.
pip install --no-deps --force-reinstall -e .
)

# Install triton_kernels. The release atom0.1.2.post image cleans up
# /triton-test/ from the build stage, so it's typically absent. Fall back
# to ROCm/triton's RI3.5.x branch — NOT triton-lang/triton upstream:
#
# * Upstream triton-lang/triton refactored the matmul_ogs module into
# matmul.py (and removed routing.py). PR #650's fused_moe_triton.py
# imports `from triton_kernels.matmul_ogs import matmul_ogs,
# PrecisionConfig` and `from triton_kernels.routing import routing`,
# which only resolve against the ROCm fork's release-internal branch.
# * ROCm/triton RI3.5.x at e491726 has matmul_ogs.py (with PrecisionConfig
# and matmul_ogs), routing.py, CDNA4MXScaleLayout in layout.py (the
# class PR #650 imports), and target_info.py that imports only is_hip /
# is_hip_cdna3 / is_hip_cdna4 — no is_hip_gfx1250, which the image's
# bundled triton would reject.
#
# triton_kernels is a self-contained subpackage (pyproject deps: numpy,
# pytest); installing it does not perturb the image's triton itself.
# Bump only after AMD ships a newer ATOM image whose bundled triton
# exports is_hip_gfx1250, at which point we can move to a newer RI branch.
TRITON_KERNELS_SHA="e49172654d55f460c6fc24d77a3ea8a286bcaee8"
# --force-reinstall mirrors the atom install above: triton_kernels also ships
# as a wheel in the image, and without --force-reinstall pip can short-circuit
# the editable switch when name/version match, leaving the wheel build active.
if [ -d /triton-test/python/triton_kernels/ ]; then
pip install --no-deps --force-reinstall -e /triton-test/python/triton_kernels/
else
TRITON_DIR="/tmp/rocm-triton"
if [ ! -d "$TRITON_DIR/.git" ]; then
git clone --filter=blob:none https://github.com/ROCm/triton.git "$TRITON_DIR"
fi
(
cd "$TRITON_DIR"
git fetch --depth=1 origin "$TRITON_KERNELS_SHA" 2>/dev/null \
|| git fetch --depth=1 origin RI3.5.x
git checkout --force "$TRITON_KERNELS_SHA"
pip install --no-deps --force-reinstall -e python/triton_kernels/
)
fi

# Preflight version checks. The chosen base image
# (atom0.1.2.post, rebuilt 2026-04-23) was tagged after ATOM pinned
# transformers==5.2.0 (commit 67d6cb61, 2026-03-13), so transformers compat
# is expected; we still assert it explicitly to fail fast with a clear
# message rather than timing out wait_for_server_ready on a confusing
# import error inside the server log. The two non-trivial deps the PR
# introduces are transformers' deepseek_v3 config class (mapped from
# deepseek_v4 in atom/config.py) and triton_kernels.CDNA4MXScaleLayout
# (renamed from GFX950MXScaleLayout in fused_moe_triton.py).
python3 - <<'PYEOF'
import importlib, os, sys
import atom

# Verify the editable install actually took effect — Python could still be
# importing the wheel-installed atom if pip's --force-reinstall silently no-op'd
# (e.g., because of a setup.py path mismatch between the wheel and the editable copy).
atom_path = os.path.abspath(atom.__file__)
expected = os.path.abspath(os.environ["ATOM_PR_DIR"])
print(f"atom imported from: {atom_path}")
if expected not in atom_path:
sys.exit(f"FATAL: atom is importing from {atom_path}, not from PR checkout {expected}. "
f"The pip --force-reinstall -e . did not take effect.")

import transformers
print(f"transformers version: {transformers.__version__}")

# Use CONFIG_MAPPING directly: AutoConfig.for_model() returns an instance
# (transformers 5.2.0 source: `return config_class(*args, **kwargs)`), not a
# class, so `.__name__` would AttributeError. CONFIG_MAPPING maps model_type
# to the config class directly and is unambiguous.
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
if "deepseek_v3" not in CONFIG_MAPPING:
sys.exit(f"FATAL: transformers in this image cannot resolve deepseek_v3 model_type. "
f"ATOM PR #650 maps deepseek_v4 -> deepseek_v3 in _CONFIG_REGISTRY and needs "
f"transformers to know the v3 schema. Available types: "
f"{sorted(k for k in CONFIG_MAPPING if 'deepseek' in k)}")
print(f"deepseek_v3 config class: {CONFIG_MAPPING['deepseek_v3'].__name__}")

try:
layout_mod = importlib.import_module("triton_kernels.tensor_details.layout")
if not hasattr(layout_mod, "CDNA4MXScaleLayout"):
avail = [n for n in dir(layout_mod) if "Layout" in n]
sys.exit(f"FATAL: triton_kernels.tensor_details.layout has no CDNA4MXScaleLayout. "
f"PR #650's fused_moe_triton.py change renamed GFX950MXScaleLayout -> "
f"CDNA4MXScaleLayout, but this image's triton_kernels still uses the old "
f"name. Available Layout classes: {avail}")
print("triton_kernels.CDNA4MXScaleLayout: present")
except ModuleNotFoundError as e:
sys.exit(f"FATAL: triton_kernels not importable. PR #650's MoE path needs it. Error: {e}")
PYEOF

# DSv4-Pro's native max_position_embeddings is 1,048,576 (1M tokens), so we
# can't leave --max-model-len blank for 1k1k the way the dsr1-atom scripts
# do — ATOM would allocate KV cache for 1M context and OOM during warmup
# (~240 GiB consumed before the dummy forward, then sparse_attn's
# torch.where wants another ~36 GiB and there isn't 36 GiB free). DSR1's
# native context is only 128k, which is why the same blank pattern works
# there. Set 1k1k explicitly; 8k1k retains the existing 10240 cap that's
# already running successfully.
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
CALCULATED_MAX_MODEL_LEN=" --max-model-len 2304 "
else
CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 "
fi
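# Arithmetic sanity note (the "headroom" reading is an assumption): 2304 =
# 1024 ISL + 1024 OSL + 256 margin; the 8k1k cap 10240 = 8192 + 1024 + 1024.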

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN "
fi

if [ "$EP_SIZE" -gt 1 ]; then
EP=" --enable-expert-parallel"
else
EP=" "
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x

BLOCK_SIZE=${BLOCK_SIZE:-16}
# --enforce-eager is required: ROCm/ATOM#650 (PR1 skeleton) has no CUDAGraph
# support yet (deferred to a follow-up PR). max-num-seqs uses the ATOM
# default (512) — matches every other ATOM benchmark script in the repo.
# The PR1 kv_cache[:1,...] hardcode in deepseek_v4.py means any forward
# with batch>1 silently corrupts non-slot-0 lanes; this risk activates
# whenever the scheduler assembles batch>1, regardless of the explicit
# max-num-seqs value, so pinning it to 4 (the PR's offline repro value)
# offered no protective benefit. eval (gsm8k) at conc>1 is the canary.
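# Illustration of the hazard (shapes and read path assumed for exposition
# only): with the PR1 hardcode, attention indexes kv_cache[:1, ...] instead of
# kv_cache[:B, ...], so slots 1..B-1 of a batched forward see slot 0's KV state.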
python3 -m atom.entrypoints.openai_server \
--model $MODEL \
--server-port $PORT \
-tp $TP \
--kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \
--block-size $BLOCK_SIZE \
--enforce-eager \
--trust-remote-code > $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
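
A minimal local invocation sketch (values from the YAML's 1k1k / conc=1 row; the RANDOM_RANGE_RATIO and RESULT_FILENAME values are illustrative, not canonical):

MODEL=deepseek-ai/DeepSeek-V4-Pro TP=8 EP_SIZE=1 CONC=1 ISL=1024 OSL=1024 \
RANDOM_RANGE_RATIO=0.8 RESULT_FILENAME=dsv4_fp4_1k1k_conc1.json \
bash benchmarks/single_node/dsv4_fp4_mi355x_atom.sh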
11 changes: 11 additions & 0 deletions perf-changelog.yaml
@@ -1833,3 +1833,14 @@
- "Bump --chunked-prefill-size from 4096 to 8192"
- "Retrigger dsv4-fp8-mi355x-sglang"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160

- config-keys:
- dsv4-fp4-mi355x-atom
description:
- "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)"
- "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . from a pinned PR SHA (cdbff35) inside the benchmark script"
- "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton"
- "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us"
- "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)"
- "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1165
2 changes: 1 addition & 1 deletion runners/launch_mi355x-amds.sh
@@ -186,7 +186,7 @@ else
LOCK_FILE="${SQUASH_FILE}.lock"

set -x
- salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=300 --no-shell --job-name="$RUNNER_NAME"
+ salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME"
JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)

srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"