diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7e975fdba..ec3a67103 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7722,3 +7722,38 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 8 dp-attn: true + +dsv4-fp4-gb300-dynamo-sglang: + # _arm64 variant: GH runner pod doing `enroot import` is amd64, but + # gb300-cw compute nodes are aarch64 (Grace). Without the explicit + # arm64 tag the registry serves the amd64 manifest, which fails to + # exec on the compute side. + image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + # Uses the sa-bench recipe copied exactly from NVIDIA/srt-slurm: + # recipes/dsv4-pro/sglang/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml + # at commit 9d75f82acec163594658a440f39dd7f1bd35bd16. + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # 1 prefill worker and 3 decode workers, each TP=8. 
+ - conc-list: [32, 64, 128, 256, 512, 1024] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 60f3299cf..f574c629c 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -139,3 +139,8 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 75036a986..45330b378 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -171,6 +171,9 @@ jobs: set -x # Export RESULT_FILENAME early so it's available for artifact uploads even if cancelled echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV + rm -f multinode_server_logs.tar.gz + rm -rf LOGS + rm -f ${RESULT_FILENAME}_*.json agg_${RESULT_FILENAME}_*.json export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} export IS_MULTINODE=true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p1d-tp8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p1d-tp8.yaml new file mode 100644 index 000000000..158f5d299 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p1d-tp8.yaml @@ -0,0 +1,80 @@ +# DeepSeek-V4-Pro disaggregated on GB200 (1P1D, TP=8, MXFP4) + +# AIME 2025 (aime25): all 30 problems, full concurrency + +name: "dsv4-pro-gb200-1k1k-disagg-1p1d-tp8-aime" + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + +frontend: + type: dynamo + nginx_container: 
nginx + +model: + path: "dspro" + container: "dspro-0426-nixl" + precision: "mxfp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +backend: + type: sglang + + prefill_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + decode_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + sglang_config: + prefill: + disaggregation-bootstrap-port: 30001 + served-model-name: "dspro" + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + max-running-requests: 1024 + cuda-graph-max-bs: 2048 + mem-fraction-static: 0.85 + + decode: + served-model-name: "dspro" + disaggregation-bootstrap-port: 30001 + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + mem-fraction-static: 0.85 + +benchmark: + type: "aime" + aime_dataset: "aime25" + num_threads: 30 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml new file mode 100644 index 000000000..9e81c512a --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml @@ -0,0 +1,83 @@ +# DeepSeek-V4-Pro disaggregated on GB200 (1P3D, TP=8, MXFP4) +# +# Some basic rate matching +# TODO: no optimizations have been applied yet + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dspro" + container: "dspro-0426" + precision: "mxfp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 6 + prefill_workers: 1 + decode_workers: 3 + gpus_per_node: 4 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +backend: + type: sglang + + prefill_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + decode_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + + sglang_config: + prefill: + disaggregation-bootstrap-port: 30001 + served-model-name: "dspro" + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + max-running-requests: 1024 + cuda-graph-max-bs: 2048 + mem-fraction-static: 0.85 + + decode: + served-model-name: "dspro" + disaggregation-bootstrap-port: 30001 + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + mem-fraction-static: 
0.85 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "32x64x128x256x512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/8k1k/disagg/stp/disagg-1p1d-tp8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/8k1k/disagg/stp/disagg-1p1d-tp8.yaml new file mode 100644 index 000000000..a8e11de01 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4/8k1k/disagg/stp/disagg-1p1d-tp8.yaml @@ -0,0 +1,94 @@ +# DeepSeek-V4-Pro disaggregated on GB200 (1P1D, TP=8, MXFP4) — 8k1k newtp + dspro-0426. +# WIP + +name: "gb200-mxfp4-8k1k-disagg-newtp" + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + +frontend: + type: dynamo + nginx_container: nginx + +model: + path: "dspro" + container: "dspro-0426" + precision: "mxfp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +backend: + type: sglang + + prefill_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + decode_environment: + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + sglang_config: + prefill: + 
disaggregation-bootstrap-port: 30001 + served-model-name: "dspro" + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + + decode: + served-model-name: "dspro" + disaggregation-bootstrap-port: 30001 + trust-remote-code: true + tensor-parallel-size: 8 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 8192 + disable-flashinfer-autotune: true + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "32x64x128x256x512" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2bd14e776..4969573e0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1877,7 +1877,7 @@ - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 - + - config-keys: - dsv4-fp4-b200-sglang description: @@ -1985,3 +1985,12 @@ - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Add DeepSeek-V4-Pro FP4 GB300 Dynamo SGLang disaggregated multinode 
configuration" + - "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 (gb300-cw compute nodes are aarch64)" + - "Topology: 1 prefill worker + 3 decode workers, TP=8, MXFP4 MoE kernels, NIXL KV transfer" + - "Recipes copied exactly from NVIDIA/srt-slurm recipes/dsv4-pro/sglang/gb200-fp4 at commit 9d75f82acec163594658a440f39dd7f1bd35bd16" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1169 diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh new file mode 100755 index 000000000..70c03987b --- /dev/null +++ b/runners/launch_gb300-cw.sh @@ -0,0 +1,303 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + SGLang benchmarks on the gb300-cw +# (CoreWeave) cluster. Adapted from the dynamo-vllm sibling launcher in +# the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). The SGLang +# recipes are copied exactly from the pinned srt-slurm commit below. + +set -x + +archive_server_logs() { + if [ -n "${LOGS_DIR:-}" ] && [ -d "$LOGS_DIR" ]; then + local workspace="${GITHUB_WORKSPACE:-$(pwd)}" + echo "Archiving server logs from $LOGS_DIR" + rm -rf "$workspace/LOGS" + cp -r "$LOGS_DIR" "$workspace/LOGS" || true + tar czf "$workspace/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . || true + fi +} + +trap 'status=$?; archive_server_logs; exit $status' EXIT +trap 'echo "Received termination signal"; exit 143' INT TERM + +if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local + # NVMe on cw. The exact upstream recipes refer to this model as + # `dspro`. + export MODEL_PATH="/mnt/vast/models/dsv4/" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; account `cw-sup` is +# what `sacctmgr show assoc user=$USER` returns there. 
`benchmark` +# (inherited from gb200-nv) does not exist on cw. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="cw-sup" + +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env +# to decide which host driver libraries (libcuda.so.1, libnvidia-*.so) +# to mount into the container. cw doesn't set them by default — without +# them the container has no libcuda and CUDA init fails. SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +NGINX_IMAGE="nginx:1.27.4" +SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / +# pyxis rejects '+' in image paths with "Invalid image format", and the +# old /mnt/vast/squash dir contains '+'-separated files from prior runs. +SQUASH_DIR="/mnt/vast/squash_dupe" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +enroot import -o $SQUASH_FILE docker://$IMAGE +enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits +# hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." 
+ rm -rf "$SRT_REPO_DIR" +fi + +git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +git checkout "$SRT_SLURM_RECIPES_COMMIT" + +# Overlay the local copy of the exact pinned recipes. This keeps the PR +# self-contained while preserving byte-for-byte recipe content from +# NVIDIA/srt-slurm at $SRT_SLURM_RECIPES_COMMIT. +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4" recipes/dsv4-pro/sglang/gb200-fp4 + +echo "Installing srtctl..." +# CRITICAL — uv install location. +# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is +# shared NFS across both. srtctl's slurm template (job_script_minimal.j2) +# does `if ! command -v uv` and skips its own ARM64 install when uv is +# already on PATH; on compute nodes $HOME/.local/bin is on PATH by +# default, so a stray x86 binary at $HOME/.local/bin/uv from this +# runner shadows the template's install and crashes the orchestrator +# with `cannot execute binary file: Exec format error`. Install to a +# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86 +# uv left in the shared path by prior runs. +rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx" +export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin" +mkdir -p "$XDG_BIN_HOME" +curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh +export PATH="$XDG_BIN_HOME:$PATH" + +if [ ! -x "$XDG_BIN_HOME/uv" ]; then + echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2 + exit 1 +fi +if [ -e "$HOME/.local/bin/uv" ]; then + echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2 + exit 1 +fi + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! 
command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +echo "Creating srtslurm.yaml configuration..." +cat > srtslurm.yaml < "$TMP_CONFIG_FILE" + mv "$TMP_CONFIG_FILE" "$CONFIG_FILE" +fi + +# CoreWeave needs explicit CPU and memory allocation for srt-slurm SGLang +# jobs. Apply this only to the cloned runtime copy so the committed +# NVIDIA recipe files stay byte-identical to the pinned source. +if ! grep -q '^sbatch_directives:' "$CONFIG_FILE"; then + { + echo "" + echo "# CoreWeave runtime-only Slurm resource directives." + echo "sbatch_directives:" + echo ' cpus-per-task: "144"' + echo ' mem: "0"' + } >> "$CONFIG_FILE" +fi + +SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +set +x + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +LOGS_DIR="outputs/$JOB_ID/logs" +LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + +while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." + sleep 5 +done + +( + while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do + sleep 10 + done +) & +POLL_PID=$! + +echo "Tailing LOG_FILE: $LOG_FILE" + +tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + +wait $POLL_PID + +set -x + +echo "Job $JOB_ID completed!" +echo "Collecting results..." 
+ +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + archive_server_logs +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! -d "$LOGS_DIR" ]; then + exit 1 + fi + + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + CONFIG_NAME=$(basename "$result_subdir") + + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi + +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u 
nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 2a6389a78..d33f1c1a5 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -47,6 +47,7 @@ def base_env_vars(): "OSL": "1024", "DISAGG": "false", "MODEL_PREFIX": "dsr1", + "IMAGE": "test-image", }