From f08413b262b19be3d9c5b2558d7a094b724aa148 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Fri, 24 Apr 2026 20:08:54 -0700
Subject: [PATCH] Bump MI355X SLURM time-limit to 300m; retrigger
 dsv4-fp8-mi355x-sglang

DSv4-Pro on MI355X was hitting `STEP CANCELLED DUE TO TIME LIMIT` at the
3h SLURM cap. ~30min MoE JIT compile plus slow torch-fallback kernels
(SGLANG_HACK_FLASHMLA_BACKEND=torch and the SGLANG_OPT_*=false set from
sgl-project/sglang#23608) push the 8k1k high-concurrency runs past 180m.

Bump --time=180 to --time=300 in runners/launch_mi355x-amds.sh. The new
300-minute limit matches the GH Actions outer timeout-minutes in
benchmark-tmpl.yml, so the SLURM time limit no longer cancels the step
before the workflow cap is reached.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 perf-changelog.yaml           | 8 ++++++++
 runners/launch_mi355x-amds.sh | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 0aaf26038..5404ab235 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1786,3 +1786,11 @@
     - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048"
     - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1144
+
+- config-keys:
+    - dsv4-fp8-mi355x-sglang
+  description:
+    - "Bump MI355X SLURM allocation from --time=180 to --time=300 in runners/launch_mi355x-amds.sh"
+    - "DSv4-Pro on MI355X exceeded the 3h cap (STEP CANCELLED DUE TO TIME LIMIT) due to ~30min MoE JIT compile plus slow torch-fallback kernels (SGLANG_HACK_FLASHMLA_BACKEND=torch et al.) from sgl-project/sglang#23608"
+    - "300 minutes matches the GH Actions outer timeout-minutes cap in benchmark-tmpl.yml"
+    - "Retriggering dsv4-fp8-mi355x-sglang"
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 2454f1bd9..279cab494 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -186,7 +186,7 @@ else
     LOCK_FILE="${SQUASH_FILE}.lock"
 
     set -x
-    salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME"
+    salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=300 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)
 
     srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"