diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b039f775c..be5a8df93 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -6646,3 +6646,109 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
         tp: 8
         ep: 1
         dp-attn: false
+
+# Kimi K2.5 NVFP4 on GB200: disaggregated, multinode vLLM behind the Dynamo
+# frontend. Every search-space entry passes CONFIG_FILE, naming the
+# NVIDIA/srt-slurm recipe linked in the comment directly above it.
+kimik2.5-fp4-gb200-dynamo-vllm:
+  image: vllm/vllm-openai:v0.18.0-cu130
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: gb200
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - conc-list: [256, 512, 1024, 2048, 3072, 4096]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true
+    - conc-list: [4, 8, 16, 32, 64, 128]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml"
+      decode:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: false
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # NOTE(review): 64 is absent from this sweep but present in the 1k1k
+    # low-latency sweep; confirm the gap is intentional.
+    - conc-list: [4, 8, 16, 32, 128]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml"
+      decode:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: false
+    - conc-list: [512, 1024]
+      prefill:
+        num-worker: 3
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true
+    - conc-list: [2048]
+      prefill:
+        num-worker: 5
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+    - conc-list: [3072, 4096]
+      prefill:
+        num-worker: 6
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml
+        - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f7f824b32..a3d7b5e3e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1265,3 +1265,16 @@
   description:
     - "Add Qwen3.5-397B-A17B-FP8 H200 SGLang MTP (EAGLE speculative decoding)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1001
+
+- config-keys:
+    - kimik2.5-fp4-gb200-dynamo-vllm
+  description:
+    - "Add Kimi K2.5 NVFP4 GB200 disaggregated multinode vLLM benchmark via Dynamo frontend"
+    - "Image: vllm/vllm-openai:v0.18.0-cu130"
+    - "Model: nvidia/Kimi-K2.5-NVFP4 with NixlConnector KV transfer, FLASHINFER_MLA attention"
+    - "1k1k configs: high-throughput DEP (1P1D dep4/dep16), low-latency TEP (1P4D dep4/tep4)"
+    - "8k1k configs: low-latency TEP (1P4D), mid-curve DEP (3P1D dep16), high-throughput (5P1D dep8, 6P1D dep16)"
+    - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026"
+    - "New framework: dynamo-vllm (Dynamo frontend + vLLM backend)"
+    - "Runner script updated to clone NVIDIA/srt-slurm and map vLLM container image"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1008
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index f8f0ef26e..e0e55481f 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -34,6 +34,15 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then
         echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss or dsr1"
         exit 1
     fi
+elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
+    # dynamo-vllm is only wired up for the kimik2.5/fp4 combination.
+    if [[ $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then
+        export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4"
+        export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4"
+    else
+        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4"
+        exit 1
+    fi
 else
     export MODEL_PATH=$MODEL
 fi
@@ -112,9 +121,21 @@ if [ -d "$SRT_REPO_DIR" ]; then
     rm -rf "$SRT_REPO_DIR"
 fi
 
-git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
-cd "$SRT_REPO_DIR"
-git checkout sa-submission-q1-2026
+# dynamo-vllm takes its recipes from NVIDIA/srt-slurm (sa-submission-q2-2026);
+# every other framework stays on the ishandhanani fork's q1 branch.
+if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
+    SRT_REPO_URL="https://github.com/NVIDIA/srt-slurm.git"
+    SRT_REPO_REF="sa-submission-q2-2026"
+else
+    SRT_REPO_URL="https://github.com/ishandhanani/srt-slurm.git"
+    SRT_REPO_REF="sa-submission-q1-2026"
+fi
+
+git clone "$SRT_REPO_URL" "$SRT_REPO_DIR"
+# Stop if the clone did not produce the directory, so the checkout below can
+# never run against whichever repository the job started in.
+cd "$SRT_REPO_DIR" || exit 1
+git checkout "$SRT_REPO_REF"
 
 echo "Installing srtctl..."
 curl -LsSf https://astral.sh/uv/install.sh | sh
@@ -155,6 +176,7 @@ model_paths:
 containers:
   dynamo-trtllm: ${SQUASH_FILE}
   dynamo-sglang: ${SQUASH_FILE}
+  "${IMAGE}": ${SQUASH_FILE}
   nginx-sqsh: ${NGINX_SQUASH_FILE}
 EOF
 