From b0a6c06e3fc4e7e8ec759e620666990a43b125fc Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 19 Apr 2026 15:00:29 -0700 Subject: [PATCH 1/7] Revert DSR1 FP4 MI355X SGLang image to mori-0227-3 Missed staging this change before merging #1000. --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index f1181b941..993a075bd 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -953,7 +953,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg @@ -1161,7 +1161,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg From f6106d209a49bc91ee8ffce6b8cf0460cd886b9b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 19 Apr 2026 15:06:35 -0700 Subject: [PATCH 2/7] Add GB300 and H200 evals-only runs to perf-changelog --- perf-changelog.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 16fea938d..d6841f18f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,15 @@ +- config-keys: + - dsr1-fp4-gb300-dynamo-trt + - dsr1-fp8-gb300-dynamo-trt + - dsr1-fp4-gb300-dynamo-sglang + - dsr1-fp8-gb300-dynamo-sglang + - dsr1-fp8-h200-dynamo-trt + - dsr1-fp8-h200-dynamo-sglang + description: + - "Add GB300 and H200 multinode evals-only runs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1094 + evals-only: true + - config-keys: - kimik2.5-fp4-gb200-dynamo-trt description: From 0c71d9bc666905093676936a794221980536e4ab Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 19 Apr 2026 15:35:31 -0700 Subject: [PATCH 3/7] Remove GB300 from evals-only perf-changelog, keep H200 only --- perf-changelog.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d6841f18f..c529de7ca 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,12 +1,8 @@ - config-keys: - - dsr1-fp4-gb300-dynamo-trt - - dsr1-fp8-gb300-dynamo-trt - - dsr1-fp4-gb300-dynamo-sglang - - dsr1-fp8-gb300-dynamo-sglang - dsr1-fp8-h200-dynamo-trt - dsr1-fp8-h200-dynamo-sglang description: - - "Add GB300 and H200 multinode evals-only runs" + - "Add H200 multinode evals-only runs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1094 evals-only: true From b45342bbed44e11ac276cdf4ba34114b06f8a016 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 19 Apr 2026 15:46:36 -0700 Subject: [PATCH 4/7] Move H200 evals-only entry to bottom of perf-changelog --- perf-changelog.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c529de7ca..1efb95fcc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,11 +1,3 @@ -- config-keys: - - dsr1-fp8-h200-dynamo-trt - - dsr1-fp8-h200-dynamo-sglang - description: - - "Add H200 multinode evals-only runs" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1094 - evals-only: true - - config-keys: - kimik2.5-fp4-gb200-dynamo-trt description: @@ -1648,3 +1640,11 @@ - "Follows the glm5-fp8-b200-sglang launch recipe as requested, plus EAGLE speculative decoding (num-steps=3, eagle-topk=1, num-draft-tokens=4) behind SGLANG_ENABLE_SPEC_V2=1" - "Configs: 1k1k and 8k1k, TP8/EP1 conc 4-4 + TP4/EP1 conc 4-256 with spec-decoding=mtp" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + +- config-keys: + - dsr1-fp8-h200-dynamo-trt + - dsr1-fp8-h200-dynamo-sglang + description: + - "Add H200 multinode evals-only runs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1094 + evals-only: true From d7794141c167786e2b4ea9ff2e109320de9e8096 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 19 Apr 2026 17:45:13 -0700 Subject: [PATCH 5/7] Bump H200 health check timeout to 2hrs via sed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same approach as B200 launcher — overrides max_attempts to 720 in the srt-slurm config before submitting. Default 180 (30 min) is too short for disagg SGLang EAGLE cold start. --- runners/launch_h200-dgxc-slurm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 4dba44931..790045496 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -126,6 +126,7 @@ EOF # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" + sed -i 's/^ max_attempts: [0-9]*/ max_attempts: 720/' "${CONFIG_FILE%%:*}" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) echo "$SRTCTL_OUTPUT" From f53f03ebf1b0e4226bd312410003d6386957e158 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 19 Apr 2026 20:34:21 -0700 Subject: [PATCH 6/7] Fix H200 health check timeout: append block when missing H200 recipes don't have a health_check section (unlike B200), so the sed replacement was a silent no-op. Now appends the block if max_attempts isn't found in the config file. --- runners/launch_h200-dgxc-slurm.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 790045496..fa48a6c59 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -126,7 +126,13 @@ EOF # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" - sed -i 's/^ max_attempts: [0-9]*/ max_attempts: 720/' "${CONFIG_FILE%%:*}" + # Bump health check timeout: replace if exists, append if not + CONFIG_YAML="${CONFIG_FILE%%:*}" + if grep -q 'max_attempts' "$CONFIG_YAML"; then + sed -i 's/^ max_attempts: [0-9]*/ max_attempts: 720/' "$CONFIG_YAML" + else + printf '\nhealth_check:\n max_attempts: 720\n interval_seconds: 10\n' >> "$CONFIG_YAML" + fi SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) echo "$SRTCTL_OUTPUT" From c49c732b8ed590c452f10f7adf6115b77f2a959a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 19 Apr 2026 22:26:34 -0700 Subject: [PATCH 7/7] Simplify H200 health check: strip existing block then append --- runners/launch_h200-dgxc-slurm.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index fa48a6c59..e11ca7b20 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -126,13 +126,8 @@ EOF # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" - # Bump health check timeout: replace if exists, append if not - CONFIG_YAML="${CONFIG_FILE%%:*}" - if grep -q 'max_attempts' "$CONFIG_YAML"; then - sed -i 's/^ max_attempts: [0-9]*/ max_attempts: 720/' "$CONFIG_YAML" - else - printf '\nhealth_check:\n max_attempts: 720\n interval_seconds: 10\n' >> "$CONFIG_YAML" - fi + sed -i '/^health_check:/,/^[^ ]/{ /^health_check:/d; /^ /d; }' "${CONFIG_FILE%%:*}" + printf '\nhealth_check:\n max_attempts: 720\n interval_seconds: 10\n' >> "${CONFIG_FILE%%:*}" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) echo "$SRTCTL_OUTPUT"