SemiAnalysisAI · functionstackx · Apr 7, 2026 · Apr 6, 2026 · Apr 6, 2026
@@ -6646,3 +6646,104 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
         tp: 8
         ep: 1
         dp-attn: false
+
+kimik2.5-fp4-gb200-dynamo-vllm:
+  image: vllm/vllm-openai:v0.18.0-cu130
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: gb200
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - conc-list: [256, 512, 1024, 2048, 3072, 4096]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # High-throughput 1P1D recipe; CONFIG_FILE below is this same path inside the srt-slurm repo: https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true
+    - conc-list: [4, 8, 16, 32, 64, 128]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # Low-latency 1P4D recipe; CONFIG_FILE below is this same path inside the srt-slurm repo: https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml"
+      decode:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: false
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - conc-list: [4, 8, 16, 32, 128]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # Low-latency 1P4D recipe; CONFIG_FILE below is this same path inside the srt-slurm repo: https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml"
+      decode:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: false
+    - conc-list: [512, 1024]
+      prefill:
+        num-worker: 3
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # Mid-curve 3P1D recipe; CONFIG_FILE below is this same path inside the srt-slurm repo: https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true
+    - conc-list: [2048]
+      prefill:
+        num-worker: 5
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # High-throughput 5P1D recipe; CONFIG_FILE below is this same path inside the srt-slurm repo: https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+    - conc-list: [3072, 4096]
+      prefill:
+        num-worker: 6
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # High-throughput 6P1D recipe; CONFIG_FILE below is this same path inside the srt-slurm repo: https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -1265,3 +1265,16 @@
   description:
     - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001
+
+- config-keys:
+    - kimik2.5-fp4-gb200-dynamo-vllm
+  description:
+    - "Add Kimi K2.5 NVFP4 GB200 disaggregated multinode vLLM benchmark via Dynamo frontend"
+    - "Image: vllm/vllm-openai:v0.18.0-cu130"
+    - "Model: nvidia/Kimi-K2.5-NVFP4 with NixlConnector KV transfer, FLASHINFER_MLA attention"
+    - "1k1k configs: high-throughput DEP (1P1D dep4/dep16), low-latency TEP (1P4D dep4/tep4)"
+    - "8k1k configs: low-latency TEP (1P4D dep4/tep4), mid-curve DEP (3P1D dep4/dep16), high-throughput DEP (5P1D dep4/dep8, 6P1D dep4/dep16)"
+    - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026"
+    - "New framework: dynamo-vllm (Dynamo frontend + vLLM backend)"
+    - "Runner script updated to clone NVIDIA/srt-slurm and map vLLM container image"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1008
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
@@ -34,6 +34,14 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then
         echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss or dsr1"
         exit 1
     fi
+elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
+    if [[ $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then
+        export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4"
+        export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4"
+    else
+        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4"
+        exit 1
+    fi
 else
     export MODEL_PATH=$MODEL
 fi
@@ -112,9 +120,15 @@ if [ -d "$SRT_REPO_DIR" ]; then
     rm -rf "$SRT_REPO_DIR"
 fi
 
-git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
-cd "$SRT_REPO_DIR"
-git checkout sa-submission-q1-2026
+# Default: original srt-slurm fork on the Q1 branch. dynamo-vllm recipes come from NVIDIA/srt-slurm on the Q2 branch.
+SRT_REPO_URL="https://github.com/ishandhanani/srt-slurm.git"
+SRT_REPO_REF="sa-submission-q1-2026"
+if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
+    SRT_REPO_URL="https://github.com/NVIDIA/srt-slurm.git"
+    SRT_REPO_REF="sa-submission-q2-2026"
+fi
+git clone --branch "$SRT_REPO_REF" "$SRT_REPO_URL" "$SRT_REPO_DIR"
+cd "$SRT_REPO_DIR" || exit 1  # abort instead of running later steps outside the clone
 
 echo "Installing srtctl..."
 curl -LsSf https://astral.sh/uv/install.sh | sh
@@ -155,6 +169,7 @@ model_paths:
 containers:
   dynamo-trtllm: ${SQUASH_FILE}
   dynamo-sglang: ${SQUASH_FILE}
+  "${IMAGE}": ${SQUASH_FILE}
   nginx-sqsh: ${NGINX_SQUASH_FILE}
 EOF