diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 0aaf26038..5404ab235 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1786,3 +1786,11 @@
   - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048"
   - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1144
+
+- config-keys:
+  - dsv4-fp8-mi355x-sglang
+  description:
+  - "Bump MI355X SLURM allocation from --time=180 to --time=300 in runners/launch_mi355x-amds.sh"
+  - "DSv4-Pro on MI355X exceeded the 3h cap (STEP CANCELLED DUE TO TIME LIMIT) due to ~30min MoE JIT compile plus slow torch-fallback kernels (SGLANG_HACK_FLASHMLA_BACKEND=torch et al.) from sgl-project/sglang#23608"
+  - "300 minutes matches the GH Actions outer timeout-minutes cap in benchmark-tmpl.yml"
+  - "Retriggering dsv4-fp8-mi355x-sglang"
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 2454f1bd9..279cab494 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -186,7 +186,7 @@
 else
     LOCK_FILE="${SQUASH_FILE}.lock"
     set -x
 
-    salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME"
+    salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=300 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)
     srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"