From d1e492ddfa9811303f69e6a51fe98ce31cfd553e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 20:35:59 -0700 Subject: [PATCH 1/9] add dsv4-fp4-mi355x-atom Day-0 marker Adds DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 single-sequence benchmark config, gated to the limitations of ROCm/ATOM#650 (PR1 skeleton): TP=8, conc=1 only, --enforce-eager, ATOM_USE_TRITON_MOE=1. Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base). The DSv4 PR is overlaid at runtime by the benchmark script via pip install --no-deps -e . from a pinned SHA (cdbff35), so no new image needs to be published. The script enforces PR1 invariants (CONC=1, EP_SIZE=1) and runs a preflight that asserts the editable atom install took effect, that transformers can resolve deepseek_v3 (the type ATOM maps deepseek_v4 to), and that triton_kernels exposes CDNA4MXScaleLayout (renamed from GFX950MXScaleLayout in the PR). Sweep will expand to TP=4/8 conc 4-256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land. --- .github/configs/amd-master.yaml | 25 +++ .../single_node/dsv4_fp4_mi355x_atom.sh | 190 ++++++++++++++++++ perf-changelog.yaml | 10 + 3 files changed, 225 insertions(+) create mode 100644 benchmarks/single_node/dsv4_fp4_mi355x_atom.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7700edf09..327ec90ee 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1489,3 +1489,28 @@ dsv4-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + +# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). +# PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...] +# hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE=1 required on +# gfx950. Image is the standard atom0.1.2.post MI355X base (matching +# qwen3.5-fp8-mi355x-atom); the DSv4 PR is overlaid at runtime by +# benchmarks/single_node/dsv4_fp4_mi355x_atom.sh at a pinned SHA. Sweep +# will expand once ATOM PR3 (multi-request) and PR4 (CUDAGraph) land. +dsv4-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh new file mode 100644 index 000000000..178b77a81 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh @@ -0,0 +1,190 @@ +#!/usr/bin/env bash +set -eo pipefail + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE" + +# PR1 invariants. The YAML constrains these to 1, but a manual invocation with +# different env vars would silently produce wrong output (kv_cache[:1,...] +# hardcode in deepseek_v4.py corrupts state at batch>1; expert-parallel serving +# is not validated by the PR's repro). Fail fast instead. 
+if [ "$CONC" -ne 1 ]; then + echo "FATAL: ROCm/ATOM#650 PR1 is single-sequence only; CONC must be 1, got $CONC" >&2 + exit 1 +fi +if [ "$EP_SIZE" -ne 1 ]; then + echo "FATAL: ROCm/ATOM#650 PR1 has not validated expert parallel serving; EP_SIZE must be 1, got $EP_SIZE" >&2 + exit 1 +fi + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +export OMP_NUM_THREADS=1 + +# DSv4-specific ATOM env vars (from ROCm/ATOM#650 repro command). +# The aiter fused_moe path is broken on gfx950 with a16w4+Swiglu, so PR1 +# requires the triton matmul_ogs path. AITER_LOG_LEVEL quiets the noisy +# warmup logs that otherwise drown out the server-ready signal. +export ATOM_USE_TRITON_MOE=1 +export AITER_LOG_LEVEL=WARNING + +# Apply ROCm/ATOM#650 (DSv4 PR1 skeleton) over the image's wheel-installed +# atom. The chosen base image ships atom as a built wheel, not editable, so +# we overlay an editable install from the PR branch at a pinned SHA. Bump +# this SHA when the PR moves; do not track the branch tip (the run becomes +# a moving target if the branch is force-pushed). +ATOM_PR_SHA="cdbff359d3db7afd3801e28b38fc71253121ee84" +export ATOM_PR_DIR="/tmp/atom-pr650" + +if [ ! -d "$ATOM_PR_DIR/.git" ]; then + git clone --filter=blob:none https://github.com/ROCm/ATOM.git "$ATOM_PR_DIR" +fi +( + cd "$ATOM_PR_DIR" + # Try a targeted fetch first (fast); fall back to fetching the PR ref if + # the server doesn't allow fetching the SHA directly. + git fetch --depth=1 origin "$ATOM_PR_SHA" 2>/dev/null \ + || git fetch --depth=1 origin pull/650/head + git checkout --force "$ATOM_PR_SHA" + test "$(git rev-parse HEAD)" = "$ATOM_PR_SHA" + # --no-deps: don't churn the image's pinned ROCm/torch/triton/aiter. + # --force-reinstall: replace the wheel-installed atom with the editable copy. + pip install --no-deps --force-reinstall -e . +) + +# PR #650's repro explicitly reinstalls triton_kernels editable. Conditional +# in case the path differs in the chosen image; safe no-op if already present. +if [ -d /triton-test/python/triton_kernels/ ]; then + pip install --no-deps -e /triton-test/python/triton_kernels/ +fi + +# Preflight version checks. The chosen base image +# (atom0.1.2.post, rebuilt 2026-04-23) was tagged after ATOM pinned +# transformers==5.2.0 (commit 67d6cb61, 2026-03-13), so transformers compat +# is expected; we still assert it explicitly to fail fast with a clear +# message rather than timing out wait_for_server_ready on a confusing +# import error inside the server log. The two non-trivial deps the PR +# introduces are transformers' deepseek_v3 config class (mapped from +# deepseek_v4 in atom/config.py) and triton_kernels.CDNA4MXScaleLayout +# (renamed from GFX950MXScaleLayout in fused_moe_triton.py). +python3 - <<'PYEOF' +import importlib, os, sys +import atom + +# Verify the editable install actually took effect — Python could still be +# importing the wheel-installed atom if pip's --force-reinstall silently no-op'd +# (e.g., the wheel and the editable copy share a setup.py path mismatch). +atom_path = os.path.abspath(atom.__file__) +expected = os.path.abspath(os.environ["ATOM_PR_DIR"]) +print(f"atom imported from: {atom_path}") +if expected not in atom_path: + sys.exit(f"FATAL: atom is importing from {atom_path}, not from PR checkout {expected}. " + f"The pip --force-reinstall -e . 
did not take effect.") + +import transformers +print(f"transformers version: {transformers.__version__}") + +# Use CONFIG_MAPPING directly: AutoConfig.for_model() returns an instance +# (transformers 5.2.0 source: `return config_class(*args, **kwargs)`), not a +# class, so `.__name__` would AttributeError. CONFIG_MAPPING maps model_type +# to the config class directly and is unambiguous. +from transformers.models.auto.configuration_auto import CONFIG_MAPPING +if "deepseek_v3" not in CONFIG_MAPPING: + sys.exit(f"FATAL: transformers in this image cannot resolve deepseek_v3 model_type. " + f"ATOM PR #650 maps deepseek_v4 -> deepseek_v3 in _CONFIG_REGISTRY and needs " + f"transformers to know the v3 schema. Available types: " + f"{sorted(k for k in CONFIG_MAPPING if 'deepseek' in k)}") +print(f"deepseek_v3 config class: {CONFIG_MAPPING['deepseek_v3'].__name__}") + +try: + layout_mod = importlib.import_module("triton_kernels.tensor_details.layout") + if not hasattr(layout_mod, "CDNA4MXScaleLayout"): + avail = [n for n in dir(layout_mod) if "Layout" in n] + sys.exit(f"FATAL: triton_kernels.tensor_details.layout has no CDNA4MXScaleLayout. " + f"PR #650's fused_moe_triton.py change renamed GFX950MXScaleLayout -> " + f"CDNA4MXScaleLayout, but this image's triton_kernels still uses the old " + f"name. Available Layout classes: {avail}") + print("triton_kernels.CDNA4MXScaleLayout: present") +except ModuleNotFoundError as e: + sys.exit(f"FATAL: triton_kernels not importable. PR #650's MoE path needs it. Error: {e}") +PYEOF + +# Calculate max-model-len based on ISL and OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + CALCULATED_MAX_MODEL_LEN="" +else + CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN=" --max-model-len $EVAL_MAX_MODEL_LEN " +fi + +if [ "$EP_SIZE" -gt 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x + +BLOCK_SIZE=${BLOCK_SIZE:-16} +# --enforce-eager is required: ROCm/ATOM#650 (PR1 skeleton) has no CUDAGraph +# support yet (deferred to a follow-up PR). --max-num-seqs 1 caps the path +# at the single-sequence ceiling that PR1 supports — the model_runner has a +# hardcoded kv_cache[:1,...] that silently corrupts state for batch>1. +python3 -m atom.entrypoints.openai_server \ + --model $MODEL \ + --server-port $PORT \ + -tp $TP \ + --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \ + --block-size $BLOCK_SIZE \ + --enforce-eager \ + --max-num-seqs 1 > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
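+# $! captures the PID of the openai_server process backgrounded above; it is
+# handed to wait_for_server_ready below (together with the port and server
+# log) via --server-pid.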
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7ed3c16ff..6d8a947de 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1833,3 +1833,13 @@ - "Bump --chunked-prefill-size from 4096 to 8192" - "Retrigger dsv4-fp8-mi355x-sglang" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160 + +- config-keys: + - dsv4-fp4-mi355x-atom + description: + - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" + - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . from a pinned PR SHA (cdbff35) inside the benchmark script" + - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" + - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" + - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TODO From dfae4fc1f42702683dcefeb95e05cf4ac2e14c53 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 20:48:41 -0700 Subject: [PATCH 2/9] fall back to triton-lang/triton for triton_kernels The release rocm/atom:atom0.1.2.post image cleans up the build-stage path /triton-test/python/triton_kernels/, so the conditional editable install was a no-op and the preflight failed with "No module named 'triton_kernels'". Clone triton-lang/triton at a pinned SHA (028e5da5, latest commit to python/triton_kernels/triton_kernels/tensor_details/layout.py as of 2026-04-10) and pip install --no-deps -e the subpackage. triton_kernels is self-contained (pyproject deps: numpy, pytest), so this does not perturb the image's triton itself. --- .../single_node/dsv4_fp4_mi355x_atom.sh | 22 +++++++++++++++++-- perf-changelog.yaml | 1 + 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh index 178b77a81..b0810ce4e 100644 --- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh @@ -68,10 +68,28 @@ fi pip install --no-deps --force-reinstall -e . ) -# PR #650's repro explicitly reinstalls triton_kernels editable. Conditional -# in case the path differs in the chosen image; safe no-op if already present. +# Install triton_kernels. 
The release atom0.1.2.post image cleans up +# /triton-test/ from the build stage, so it's typically absent; fall back to +# upstream triton-lang/triton at a pinned SHA whose python/triton_kernels has +# the CDNA4MXScaleLayout class PR #650 imports (the rename from +# GFX950MXScaleLayout landed upstream in commit c69c3a95 on 2026-01-10; we +# pin to 028e5da5 from 2026-04-10, the latest commit to that file). +# triton_kernels is a self-contained subpackage (pyproject deps: numpy, +# pytest) — installing it does not perturb the image's triton itself. +TRITON_KERNELS_SHA="028e5da5" if [ -d /triton-test/python/triton_kernels/ ]; then pip install --no-deps -e /triton-test/python/triton_kernels/ +else + TRITON_DIR="/tmp/triton-upstream" + if [ ! -d "$TRITON_DIR/.git" ]; then + git clone --filter=blob:none https://github.com/triton-lang/triton.git "$TRITON_DIR" + fi + ( + cd "$TRITON_DIR" + git fetch --depth=1 origin "$TRITON_KERNELS_SHA" 2>/dev/null || git fetch origin + git checkout --force "$TRITON_KERNELS_SHA" + pip install --no-deps -e python/triton_kernels/ + ) fi # Preflight version checks. The chosen base image diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 6d8a947de..dc26456a6 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1839,6 +1839,7 @@ description: - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . from a pinned PR SHA (cdbff35) inside the benchmark script" + - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to triton-lang/triton@028e5da5 which has the CDNA4MXScaleLayout class PR #650 imports" - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" From 12cfe4c721605000ff45cbd8fd60e66b5ab6b3c9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 20:57:29 -0700 Subject: [PATCH 3/9] pin triton_kernels to pre-gfx1250 SHA The previous SHA (028e5da5, 2026-04-10) imports is_hip_gfx1250 from triton.language.target_info, which the image's installed triton (older, matched to atom0.1.2.post) does not export. Result: ImportError at triton_kernels load, before any of our layout-class checks run. Pin to d28db13d (2026-03-05, parent of commit 11aac682 "[AMD] Add is_hip_gfx1250 target check"). At this SHA layout.py still has CDNA4MXScaleLayout (post the 2026-01-10 rename) and target_info.py imports only is_hip / is_hip_cdna3 / is_hip_cdna4. Verified both properties against the GitHub raw blobs at this SHA. Bump only after the image's triton is upgraded to one that exposes is_hip_gfx1250. 
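For future bumps, a quick vet along these lines is enough to confirm both
properties on a candidate SHA before moving the pin (illustrative sketch;
the layout.py path is the one cited above, and the target_info.py path is
assumed to sit alongside tensor_details/ under
python/triton_kernels/triton_kernels):

    sha=<candidate-sha>
    base="https://raw.githubusercontent.com/triton-lang/triton/$sha/python/triton_kernels/triton_kernels"
    # property 1: layout.py already has the post-rename class
    curl -fsSL "$base/tensor_details/layout.py" | grep -q CDNA4MXScaleLayout \
        && echo "layout: ok" || echo "layout: pre-rename (GFX950MXScaleLayout)"
    # property 2: target_info.py does not yet import is_hip_gfx1250
    curl -fsSL "$base/target_info.py" | grep -q is_hip_gfx1250 \
        && echo "target_info: needs newer image triton" || echo "target_info: ok"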
--- .../single_node/dsv4_fp4_mi355x_atom.sh | 20 ++++++++++++------- perf-changelog.yaml | 2 +- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh index b0810ce4e..f47c60fe6 100644 --- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh @@ -70,13 +70,19 @@ fi # Install triton_kernels. The release atom0.1.2.post image cleans up # /triton-test/ from the build stage, so it's typically absent; fall back to -# upstream triton-lang/triton at a pinned SHA whose python/triton_kernels has -# the CDNA4MXScaleLayout class PR #650 imports (the rename from -# GFX950MXScaleLayout landed upstream in commit c69c3a95 on 2026-01-10; we -# pin to 028e5da5 from 2026-04-10, the latest commit to that file). -# triton_kernels is a self-contained subpackage (pyproject deps: numpy, -# pytest) — installing it does not perturb the image's triton itself. -TRITON_KERNELS_SHA="028e5da5" +# upstream triton-lang/triton at a pinned SHA chosen for compatibility with +# both PR #650 and the image's installed triton: +# * CDNA4MXScaleLayout (renamed from GFX950MXScaleLayout) must be present, +# which means SHAs after 2026-01-10 (commit c69c3a95). +# * triton_kernels' target_info.py must NOT import is_hip_gfx1250 — that +# import was added on 2026-03-05 (commit 11aac682) and the image's +# triton is older, so it ImportErrors at module load. +# d28db13d (parent of 11aac682) is the latest SHA satisfying both. Bump +# this only after the image's triton is upgraded to one that has +# is_hip_gfx1250 in triton.language.target_info. +# triton_kernels itself is a self-contained subpackage (pyproject deps: +# numpy, pytest), so installing it does not perturb the image's triton. +TRITON_KERNELS_SHA="d28db13de0cf7079c5db00e37986916f96f273f2" if [ -d /triton-test/python/triton_kernels/ ]; then pip install --no-deps -e /triton-test/python/triton_kernels/ else diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dc26456a6..daeeb96ee 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1839,7 +1839,7 @@ description: - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . 
from a pinned PR SHA (cdbff35) inside the benchmark script" - - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to triton-lang/triton@028e5da5 which has the CDNA4MXScaleLayout class PR #650 imports" + - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to triton-lang/triton@d28db13d, the latest SHA where layout.py has CDNA4MXScaleLayout (post-2026-01-10 rename) AND target_info.py does not yet import is_hip_gfx1250 (pre-2026-03-05, which the image's triton lacks)" - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" From b6a01a75c78acf33772fb6e75b0d4ce35c7d6b23 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 21:12:50 -0700 Subject: [PATCH 4/9] switch triton_kernels source to ROCm/triton RI3.5.x Upstream triton-lang/triton refactored matmul_ogs into matmul.py (removing routing.py too), but PR #650's fused_moe_triton.py imports `from triton_kernels.matmul_ogs import matmul_ogs, PrecisionConfig` and `from triton_kernels.routing import routing`. Those only resolve against the ROCm fork's RI3.5.x branch. Pin to ROCm/triton@e491726 (RI3.5.x HEAD, 2025-11-20). Verified at that SHA: * matmul_ogs.py has class PrecisionConfig and def matmul_ogs * routing.py exists * tensor_details/layout.py exports CDNA4MXScaleLayout * target_info.py imports only is_hip / is_hip_cdna3 / is_hip_cdna4 (no is_hip_gfx1250, which the image's bundled triton rejects) This matches what AMD's /triton-test/python/triton_kernels/ would have been on the build host before the release image's cleanup stage. --- .../single_node/dsv4_fp4_mi355x_atom.sh | 40 +++++++++++-------- perf-changelog.yaml | 2 +- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh index f47c60fe6..a0fb75e57 100644 --- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh @@ -69,30 +69,36 @@ fi ) # Install triton_kernels. The release atom0.1.2.post image cleans up -# /triton-test/ from the build stage, so it's typically absent; fall back to -# upstream triton-lang/triton at a pinned SHA chosen for compatibility with -# both PR #650 and the image's installed triton: -# * CDNA4MXScaleLayout (renamed from GFX950MXScaleLayout) must be present, -# which means SHAs after 2026-01-10 (commit c69c3a95). -# * triton_kernels' target_info.py must NOT import is_hip_gfx1250 — that -# import was added on 2026-03-05 (commit 11aac682) and the image's -# triton is older, so it ImportErrors at module load. -# d28db13d (parent of 11aac682) is the latest SHA satisfying both. Bump -# this only after the image's triton is upgraded to one that has -# is_hip_gfx1250 in triton.language.target_info. -# triton_kernels itself is a self-contained subpackage (pyproject deps: -# numpy, pytest), so installing it does not perturb the image's triton. 
-TRITON_KERNELS_SHA="d28db13de0cf7079c5db00e37986916f96f273f2" +# /triton-test/ from the build stage, so it's typically absent. Fall back +# to ROCm/triton's RI3.5.x branch — NOT triton-lang/triton upstream: +# +# * Upstream triton-lang/triton refactored the matmul_ogs module into +# matmul.py (and removed routing.py). PR #650's fused_moe_triton.py +# imports `from triton_kernels.matmul_ogs import matmul_ogs, +# PrecisionConfig` and `from triton_kernels.routing import routing`, +# which only resolve against the ROCm fork's release-internal branch. +# * ROCm/triton RI3.5.x at e491726 has matmul_ogs.py (with PrecisionConfig +# and matmul_ogs), routing.py, CDNA4MXScaleLayout in layout.py (the +# class PR #650 imports), and target_info.py that imports only is_hip / +# is_hip_cdna3 / is_hip_cdna4 — no is_hip_gfx1250, which the image's +# bundled triton would reject. +# +# triton_kernels is a self-contained subpackage (pyproject deps: numpy, +# pytest); installing it does not perturb the image's triton itself. +# Bump only after AMD ships a newer ATOM image whose bundled triton +# exports is_hip_gfx1250, at which point we can move to a newer RI branch. +TRITON_KERNELS_SHA="e49172654d55f460c6fc24d77a3ea8a286bcaee8" if [ -d /triton-test/python/triton_kernels/ ]; then pip install --no-deps -e /triton-test/python/triton_kernels/ else - TRITON_DIR="/tmp/triton-upstream" + TRITON_DIR="/tmp/rocm-triton" if [ ! -d "$TRITON_DIR/.git" ]; then - git clone --filter=blob:none https://github.com/triton-lang/triton.git "$TRITON_DIR" + git clone --filter=blob:none https://github.com/ROCm/triton.git "$TRITON_DIR" fi ( cd "$TRITON_DIR" - git fetch --depth=1 origin "$TRITON_KERNELS_SHA" 2>/dev/null || git fetch origin + git fetch --depth=1 origin "$TRITON_KERNELS_SHA" 2>/dev/null \ + || git fetch --depth=1 origin RI3.5.x git checkout --force "$TRITON_KERNELS_SHA" pip install --no-deps -e python/triton_kernels/ ) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index daeeb96ee..73ac175ad 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1839,7 +1839,7 @@ description: - "Add DeepSeek-V4-Pro FP4 MI355X ATOM Day-0 marker (single-sequence, TP=8, conc=1)" - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post (matches qwen3.5-fp8-mi355x-atom base); ROCm/ATOM#650 overlaid at runtime via pip install --no-deps -e . 
from a pinned PR SHA (cdbff35) inside the benchmark script" - - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to triton-lang/triton@d28db13d, the latest SHA where layout.py has CDNA4MXScaleLayout (post-2026-01-10 rename) AND target_info.py does not yet import is_hip_gfx1250 (pre-2026-03-05, which the image's triton lacks)" + - "triton_kernels is missing from the release image (build-stage path /triton-test/python/triton_kernels/ is cleaned up); the script falls back to ROCm/triton@e491726 (RI3.5.x), which has matmul_ogs.py and routing.py (PR #650 imports both — upstream triton-lang/triton refactored matmul_ogs into matmul.py and removed routing) plus CDNA4MXScaleLayout and a target_info.py compatible with the image's bundled triton" - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" From b1266244cc3d79023ab293857556e2972eb35fd1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 22:19:19 -0700 Subject: [PATCH 5/9] max-len --- .../single_node/dsv4_fp4_mi355x_atom.sh | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh index a0fb75e57..38f9869a9 100644 --- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh @@ -155,9 +155,16 @@ except ModuleNotFoundError as e: sys.exit(f"FATAL: triton_kernels not importable. PR #650's MoE path needs it. Error: {e}") PYEOF -# Calculate max-model-len based on ISL and OSL +# DSv4-Pro's native max_position_embeddings is 1,048,576 (1M tokens), so we +# can't leave --max-model-len blank for 1k1k the way the dsr1-atom scripts +# do — ATOM would allocate KV cache for 1M context and OOM during warmup +# (~240 GiB consumed before the dummy forward, then sparse_attn's +# torch.where wants another ~36 GiB and there isn't 36 GiB free). DSR1's +# native context is only 128k, which is why the same blank pattern works +# there. Set 1k1k explicitly; 8k1k retains the existing 10240 cap that's +# already running successfully. if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN="" + CALCULATED_MAX_MODEL_LEN=" --max-model-len 2304 " else CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 " fi @@ -180,9 +187,13 @@ set -x BLOCK_SIZE=${BLOCK_SIZE:-16} # --enforce-eager is required: ROCm/ATOM#650 (PR1 skeleton) has no CUDAGraph -# support yet (deferred to a follow-up PR). --max-num-seqs 1 caps the path -# at the single-sequence ceiling that PR1 supports — the model_runner has a -# hardcoded kv_cache[:1,...] that silently corrupts state for batch>1. +# support yet (deferred to a follow-up PR). --max-num-seqs 4 matches the PR's +# verified offline repro command (atom.examples.simple_inference) — using 1 +# left the warmup phase hung at 0% GPU even though the YAML constrains the +# client-side concurrency to 1. The single-sequence kv_cache[:1,...] 
hardcode +# in the model is still the actual correctness ceiling, but with max-concurrency +# pinned to 1 on the client (via the CONC=1 sanity check above) the server +# never sees a real batch>1 forward. python3 -m atom.entrypoints.openai_server \ --model $MODEL \ --server-port $PORT \ @@ -190,7 +201,7 @@ python3 -m atom.entrypoints.openai_server \ --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \ --block-size $BLOCK_SIZE \ --enforce-eager \ - --max-num-seqs 1 > $SERVER_LOG 2>&1 & + --max-num-seqs 4 > $SERVER_LOG 2>&1 & SERVER_PID=$! From 28623c1d2b0601cee27a057eeafd926e72c8b5d1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 23:19:45 -0700 Subject: [PATCH 6/9] progress --- benchmarks/single_node/dsv4_fp4_mi355x_atom.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh index 38f9869a9..506272a60 100644 --- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh @@ -63,6 +63,24 @@ fi || git fetch --depth=1 origin pull/650/head git checkout --force "$ATOM_PR_SHA" test "$(git rev-parse HEAD)" = "$ATOM_PR_SHA" + + # WORKAROUND: PR #650 has no env-var toggle to disable the aiter + # mhc_pre/mhc_post kernels, and on this image those kernels crash with + # a HIPGuardImplMasqueradingAsCUDA INTERNAL ASSERT inside aiter the + # first time the model executes the hc_pre path during prefill. SGLang's + # DSv4 recipe disables the same family explicitly + # (SGLANG_OPT_USE_TILELANG_MHC_PRE/POST=false, _DEEPGEMM_HC_PRENORM=false), + # which corroborates that aiter's MHC path is unreliable here. Force + # the torch fallback by NULL-ing the aiter lookups; deepseek_v4.py's + # hc_pre/hc_post check `mhc_pre is not None` before taking the aiter + # path, and the torch path is the PR's own reference implementation. + # Slow but correct. Remove once PR #650 (or a follow-up) lands a real + # toggle for this kernel family. + sed -i 's|mhc_pre = getattr(_aiter, "mhc_pre", None)|mhc_pre = None # patched out (HIP device-guard crash)|' atom/models/deepseek_v4.py + sed -i 's|mhc_post = getattr(_aiter, "mhc_post", None)|mhc_post = None # patched out (HIP device-guard crash)|' atom/models/deepseek_v4.py + grep -c "patched out" atom/models/deepseek_v4.py | grep -q '^2$' \ + || { echo "FATAL: mhc_pre/mhc_post sed patch did not apply twice"; exit 1; } + # --no-deps: don't churn the image's pinned ROCm/torch/triton/aiter. # --force-reinstall: replace the wheel-installed atom with the editable copy. pip install --no-deps --force-reinstall -e . From 01a8f79fe233548de3918d0ef786d6dbfdf56557 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 00:46:07 -0700 Subject: [PATCH 7/9] restore aiter mhc_post; only patch out mhc_pre The HIP device-guard crash stack only implicated mhc_pre_big_fuse from the aiter MHC family. Patching out both mhc_pre and mhc_post forced the torch fallback for the full MHC pipeline (RMSNorm + linear + 20-iter Sinkhorn + reduce, all in fp32), which is roughly 40% slower than the PR's reported aiter-MHC baseline of 213 ms/token. Restore aiter mhc_post and keep only mhc_pre on the torch fallback. If mhc_post turns out to crash with the same HIP guard assertion on a real forward, re-add the second sed line; for now the cheaper experiment is to find out whether it works. Verify-grep updated from `^2$` to `^1$` so the script still FATALs if upstream renames or removes the mhc_pre lookup site. 
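If mhc_post does turn out to crash, the server log should identify it before
any sed changes are made — something along the lines of

    grep -n -e "INTERNAL ASSERT" -e "mhc_" /workspace/server.log

(illustrative; the exact assert text for a hypothetical mhc_post failure may
differ from the mhc_pre_big_fuse one) will show which kernel the failing
stack implicates.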
--- .../single_node/dsv4_fp4_mi355x_atom.sh | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh index 506272a60..7f9b5bf19 100644 --- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh @@ -64,22 +64,20 @@ fi git checkout --force "$ATOM_PR_SHA" test "$(git rev-parse HEAD)" = "$ATOM_PR_SHA" - # WORKAROUND: PR #650 has no env-var toggle to disable the aiter - # mhc_pre/mhc_post kernels, and on this image those kernels crash with - # a HIPGuardImplMasqueradingAsCUDA INTERNAL ASSERT inside aiter the - # first time the model executes the hc_pre path during prefill. SGLang's + # WORKAROUND: PR #650 has no env-var toggle to disable the aiter MHC + # kernels, and on this image aiter's `mhc_pre_big_fuse` crashes with a + # HIPGuardImplMasqueradingAsCUDA INTERNAL ASSERT the first time the + # model executes the hc_pre path during prefill (a HIP/CUDA device-type + # mismatch inside aiter, not something we can fix from outside). SGLang's # DSv4 recipe disables the same family explicitly - # (SGLANG_OPT_USE_TILELANG_MHC_PRE/POST=false, _DEEPGEMM_HC_PRENORM=false), - # which corroborates that aiter's MHC path is unreliable here. Force - # the torch fallback by NULL-ing the aiter lookups; deepseek_v4.py's - # hc_pre/hc_post check `mhc_pre is not None` before taking the aiter - # path, and the torch path is the PR's own reference implementation. - # Slow but correct. Remove once PR #650 (or a follow-up) lands a real - # toggle for this kernel family. + # (SGLANG_OPT_USE_TILELANG_MHC_PRE/POST=false, _DEEPGEMM_HC_PRENORM=false). + # Force only `mhc_pre` to torch-fallback; leave `mhc_post` on the aiter + # path since the crash stack only implicated mhc_pre and we'd like to + # recover the perf of half the MHC pipeline. If mhc_post crashes too on + # the next run, add the second sed back. sed -i 's|mhc_pre = getattr(_aiter, "mhc_pre", None)|mhc_pre = None # patched out (HIP device-guard crash)|' atom/models/deepseek_v4.py - sed -i 's|mhc_post = getattr(_aiter, "mhc_post", None)|mhc_post = None # patched out (HIP device-guard crash)|' atom/models/deepseek_v4.py - grep -c "patched out" atom/models/deepseek_v4.py | grep -q '^2$' \ - || { echo "FATAL: mhc_pre/mhc_post sed patch did not apply twice"; exit 1; } + grep -c "patched out" atom/models/deepseek_v4.py | grep -q '^1$' \ + || { echo "FATAL: mhc_pre sed patch did not apply"; exit 1; } # --no-deps: don't churn the image's pinned ROCm/torch/triton/aiter. # --force-reinstall: replace the wheel-installed atom with the editable copy. From 00b3c1ce51254a5e0f5902eb117350f33ab8d97a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 00:57:25 -0700 Subject: [PATCH 8/9] bump MI355X / GHA timeouts from 300 to 500 minutes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dsv4-fp4-mi355x-atom (ROCm/ATOM#650 PR1, single-sequence at TP=8 with torch-fallback hc_pre because aiter mhc_pre crashes on this image) runs at ~5 min per request in steady state. With 1k1k at 12 prompts plus 8k1k at the same shape, the full sweep can exceed the 300-min cap that #1148 set for the SGLang-DSv4 path. Bump both the SLURM allocation in runners/launch_mi355x-amds.sh and the GitHub Actions timeout-minutes in benchmark-tmpl.yml together — either expiring first kills the job, so they need to stay aligned. 
Note: this is a global bump that affects every MI355X benchmark and every job that uses the shared workflow template, not just the dsv4 ATOM one. Drop back to 300 once the slow paths are gone (PR4 CUDAGraph + a working aiter MHC). --- .github/workflows/benchmark-tmpl.yml | 2 +- runners/launch_mi355x-amds.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index e5a590ef3..61259bdf2 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -98,7 +98,7 @@ permissions: jobs: benchmark: runs-on: ${{ inputs.runner }} - timeout-minutes: 300 + timeout-minutes: 500 name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} | disagg-${{ inputs.disagg }} spec-${{ inputs.spec-decoding }} conc-${{ inputs.conc }}${{ inputs.eval-only && ' | eval-only' || (inputs.run-eval && ' | eval' || '') }}" steps: - name: Resource cleanup (pre-run) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 279cab494..03de35a62 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -186,7 +186,7 @@ else LOCK_FILE="${SQUASH_FILE}.lock" set -x - salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=300 --no-shell --job-name="$RUNNER_NAME" + salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)" From ec4aa77277230c69542e33b8b65432daca318871 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 01:45:03 -0700 Subject: [PATCH 9/9] expand conc sweep, drop max-num-seqs, address review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundle of changes from this iteration: - amd-master.yaml: expand search-space from `conc-start/end: 1` to explicit conc=[1, 4, 16, 32] for both 1k1k and 8k1k. conc-list isn't supported in the single-node sweep generator (only multinode), so use four single-conc entries per ISL. - dsv4_fp4_mi355x_atom.sh: drop --max-num-seqs 4 — every other ATOM benchmark in the repo uses ATOM's default 512, and the explicit value offered no protective benefit. The PR1 kv_cache[:1,...] hardcode corrupts non-slot-0 lanes whenever the scheduler assembles batch>1, regardless of max-num-seqs. The CONC=1 sanity check was already dropped to allow the conc>1 sweep entries; eval (gsm8k) at conc>1 is the canary for kv_cache[:1] silent corruption. Address @claude review on PR #1165: - Add --trust-remote-code on both server launch and run_benchmark_serving (mirrors qwen3.5_fp8_mi355x_atom.sh:52,72 — the peer ATOM recipe on the same atom0.1.2.post image). The model has loaded successfully so far without it (ATOM's _CONFIG_REGISTRY does intercept), but the flag is the prevailing convention and a one-line defense if any startup path resolves AutoConfig.from_pretrained before the registry shim. - Add --force-reinstall to both triton_kernels editable installs (the /triton-test/ path and the ROCm/triton fallback). Mirrors the atom install above, which uses --force-reinstall for the same wheel-shadow reason. Without it, pip can short-circuit the editable switch when the wheel's name/version match the source tree. 
- Fix pr-link in perf-changelog.yaml from /pull/TODO to /pull/1165. --- .github/configs/amd-master.yaml | 6 +++ .../single_node/dsv4_fp4_mi355x_atom.sh | 41 ++++++++++--------- perf-changelog.yaml | 2 +- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 327ec90ee..1c431427e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1510,7 +1510,13 @@ dsv4-fp4-mi355x-atom: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 1, conc-start: 16, conc-end: 16 } + - { tp: 8, ep: 1, conc-start: 32, conc-end: 32 } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 1, conc-start: 16, conc-end: 16 } + - { tp: 8, ep: 1, conc-start: 32, conc-end: 32 } diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh index 7f9b5bf19..58740432c 100644 --- a/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/dsv4_fp4_mi355x_atom.sh @@ -19,14 +19,13 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE" -# PR1 invariants. The YAML constrains these to 1, but a manual invocation with -# different env vars would silently produce wrong output (kv_cache[:1,...] -# hardcode in deepseek_v4.py corrupts state at batch>1; expert-parallel serving -# is not validated by the PR's repro). Fail fast instead. -if [ "$CONC" -ne 1 ]; then - echo "FATAL: ROCm/ATOM#650 PR1 is single-sequence only; CONC must be 1, got $CONC" >&2 - exit 1 -fi +# EP_SIZE > 1 is still unvalidated by PR #650's repro (offline TP=8 EP=1 +# only). Keep the EP guard. The CONC guard was relaxed to empirically +# probe whether kv_cache[:1,...] in deepseek_v4.py actually corrupts at +# batch>1 in the server path: max-num-seqs=4 caps the running batch +# below the YAML's max conc (32), and per-sequence eval correctness will +# tell us if the hardcode bites. If gsm8k accuracy collapses at conc>1, +# put `if [ "$CONC" -ne 1 ]; then exit 1` back. if [ "$EP_SIZE" -ne 1 ]; then echo "FATAL: ROCm/ATOM#650 PR1 has not validated expert parallel serving; EP_SIZE must be 1, got $EP_SIZE" >&2 exit 1 @@ -104,8 +103,11 @@ fi # Bump only after AMD ships a newer ATOM image whose bundled triton # exports is_hip_gfx1250, at which point we can move to a newer RI branch. TRITON_KERNELS_SHA="e49172654d55f460c6fc24d77a3ea8a286bcaee8" +# --force-reinstall mirrors the atom install above: triton_kernels also ships +# as a wheel in the image, and without --force-reinstall pip can short-circuit +# the editable switch when name/version match, leaving the wheel build active. if [ -d /triton-test/python/triton_kernels/ ]; then - pip install --no-deps -e /triton-test/python/triton_kernels/ + pip install --no-deps --force-reinstall -e /triton-test/python/triton_kernels/ else TRITON_DIR="/tmp/rocm-triton" if [ ! -d "$TRITON_DIR/.git" ]; then @@ -116,7 +118,7 @@ else git fetch --depth=1 origin "$TRITON_KERNELS_SHA" 2>/dev/null \ || git fetch --depth=1 origin RI3.5.x git checkout --force "$TRITON_KERNELS_SHA" - pip install --no-deps -e python/triton_kernels/ + pip install --no-deps --force-reinstall -e python/triton_kernels/ ) fi @@ -203,13 +205,13 @@ set -x BLOCK_SIZE=${BLOCK_SIZE:-16} # --enforce-eager is required: ROCm/ATOM#650 (PR1 skeleton) has no CUDAGraph -# support yet (deferred to a follow-up PR). 
--max-num-seqs 4 matches the PR's -# verified offline repro command (atom.examples.simple_inference) — using 1 -# left the warmup phase hung at 0% GPU even though the YAML constrains the -# client-side concurrency to 1. The single-sequence kv_cache[:1,...] hardcode -# in the model is still the actual correctness ceiling, but with max-concurrency -# pinned to 1 on the client (via the CONC=1 sanity check above) the server -# never sees a real batch>1 forward. +# support yet (deferred to a follow-up PR). max-num-seqs uses the ATOM +# default (512) — matches every other ATOM benchmark script in the repo. +# The PR1 kv_cache[:1,...] hardcode in deepseek_v4.py means any forward +# with batch>1 silently corrupts non-slot-0 lanes; this risk activates +# whenever the scheduler assembles batch>1, regardless of the explicit +# max-num-seqs value, so pinning it to 4 (the PR's offline repro value) +# offered no protective benefit. eval (gsm8k) at conc>1 is the canary. python3 -m atom.entrypoints.openai_server \ --model $MODEL \ --server-port $PORT \ @@ -217,7 +219,7 @@ python3 -m atom.entrypoints.openai_server \ --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \ --block-size $BLOCK_SIZE \ --enforce-eager \ - --max-num-seqs 4 > $SERVER_LOG 2>&1 & + --trust-remote-code > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -234,7 +236,8 @@ run_benchmark_serving \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --trust-remote-code # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 73ac175ad..256c08d7b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1843,4 +1843,4 @@ - "Model: deepseek-ai/DeepSeek-V4-Pro (same canonical checkpoint used by dsv4-fp4-b300-vllm); compatibility with PR #650's WeightsMapper not yet verified — first run will tell us" - "Pinned to PR1 limitations: single-sequence kv_cache hardcode, --enforce-eager required, ATOM_USE_TRITON_MOE=1 (aiter fused_moe broken on gfx950)" - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TODO + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1165