From 80d98084fa584510838598497208d4f149da3fe6 Mon Sep 17 00:00:00 2001
From: Cam Quilici
Date: Tue, 28 Apr 2026 09:50:25 -0500
Subject: [PATCH 01/16] srt-slurm: upstream recipes and add first-class
 `recipe:` field

Recipes referenced from NVIDIA/srt-slurm@sa-submission-q2-2026 are now
tracked under benchmarks/multi_node/srt-slurm-recipes/, mirroring the
upstream `recipes/` layout. The master-yaml plumbing for selecting one
is hoisted out of `prefill.additional-settings:
["CONFIG_FILE=recipes/..."]` into a first-class `recipe:` field on the
search-space entry, validated against on-disk paths so unknown recipes
fail fast at sweep generation. The benchmark template resolves it to an
absolute scratch-copy path passed to launchers as CONFIG_FILE, so
launcher behavior is otherwise unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/configs/CONFIGS.md | 26 + .github/configs/nvidia-master.yaml | 1512 +++++------------ .../workflows/benchmark-multinode-tmpl.yml | 23 + .github/workflows/e2e-tests.yml | 2 + .github/workflows/run-sweep.yml | 2 + .../srt-slurm-recipes/b200-fp4/1k1k.yaml | 259 +++ .../srt-slurm-recipes/b200-fp4/8k1k.yaml | 351 ++++ .../srt-slurm-recipes/b200-fp8/1k1k.yaml | 281 +++ .../b200-fp8/8k1k_mtp_lowlat_0.yaml | 141 ++ .../b200-fp8/8k1k_mtp_lowlat_1.yaml | 141 ++ .../b200-fp8/8k1k_mtp_lowlat_2.yaml | 141 ++ .../b200-fp8/8k1k_mtp_maxtpt_0.yaml | 144 ++ .../b200-fp8/8k1k_mtp_maxtpt_1.yaml | 144 ++ .../b200-fp8/8k1k_mtp_maxtpt_2.yaml | 144 ++ .../b200-fp8/8k1k_mtp_maxtpt_3.yaml | 144 ++ .../b200-fp8/8k1k_stp_lowlat_0.yaml | 139 ++ .../b200-fp8/8k1k_stp_lowlat_1.yaml | 139 ++ .../b200-fp8/8k1k_stp_lowlat_2.yaml | 139 ++ .../b200-fp8/8k1k_stp_maxtpt_0.yaml | 140 ++ .../b200-fp8/8k1k_stp_maxtpt_1.yaml | 140 ++ .../b200-fp8/8k1k_stp_maxtpt_2.yaml | 140 ++ .../b200-fp8/8k1k_stp_maxtpt_3.yaml | 140 ++ .../gb200-fp4/1k1k/low-latency.yaml | 116 ++ .../gb200-fp4/1k1k/max-tpt.yaml | 183 ++ .../gb200-fp4/1k1k/mid-curve.yaml | 182 ++ .../gb200-fp4/8k1k/low-latency.yaml | 118 ++ .../gb200-fp4/8k1k/max-tpt.yaml | 179 ++ .../gb200-fp4/8k1k/mid-curve.yaml | 179 ++ .../gb200-fp8/1k1k/low-latency.yaml | 121 ++ .../gb200-fp8/1k1k/max-tpt.yaml | 175 ++ .../gb200-fp8/1k1k/mid-curve.yaml | 174 ++ .../gb200-fp8/1k1k/ultra-tpt.yaml | 176 ++ .../gb200-fp8/8k1k/low-latency.yaml | 117 ++ .../gb200-fp8/8k1k/max_tpt.yaml | 171 ++ .../gb200-fp8/8k1k/mid-curve.yaml | 170 ++ .../gb300-fp4/1k1k/low_latency.yaml | 116 ++ .../gb300-fp4/1k1k/max_tpt.yaml | 184 ++ .../gb300-fp4/1k1k/mid_curve.yaml | 182 ++ .../gb300-fp4/8k1k/low_latency.yaml | 119 ++ .../gb300-fp4/8k1k/max_tpt.yaml | 179 ++ .../gb300-fp4/8k1k/mid_curve.yaml | 179 ++ .../gb300-fp8/1k1k/stp/low-latency.yaml | 122 ++ .../gb300-fp8/1k1k/stp/max.yaml | 171 ++ .../gb300-fp8/1k1k/stp/mid.yaml | 170 ++ .../gb300-fp8/8k1k/stp/low-latency.yaml | 121 ++ .../gb300-fp8/8k1k/stp/max.yaml | 171 ++ .../gb300-fp8/8k1k/stp/mid.yaml | 171 ++ .../1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 114 ++ .../1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml | 116 ++ .../h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml | 102 ++ .../h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml | 102 ++ .../8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 116 ++ .../8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml | 116 ++ .../h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml | 102 ++ .../h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml | 102 ++ .../h200/1k1k/bs256-1p6d-dep-mtp.yaml | 121 ++ .../h200/1k1k/bs256-1p6d-dep.yaml | 109 ++ .../h200/1k1k/bs256-1p6d-tp-mtp.yaml | 118 ++ .../h200/1k1k/bs256-1p6d-tp.yaml | 109 ++ 
.../h200/1k1k/low-latency-1p9d-mtp.yaml | 116 ++ .../h200/1k1k/low-latency-1p9d.yaml | 106 ++ .../h200/8k1k/bs128-1p1d-dep-mtp.yaml | 118 ++ .../h200/8k1k/bs128-1p1d-dep.yaml | 109 ++ .../h200/8k1k/bs16-1p3d-mtp.yaml | 116 ++ .../h200/8k1k/bs16-1p3d.yaml | 107 ++ .../h200/8k1k/bs4-1p7d-mtp.yaml | 116 ++ .../srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml | 107 ++ .../h200/8k1k/bs64-2p3d-mtp.yaml | 125 ++ .../h200/8k1k/bs64-2p3d.yaml | 115 ++ .../h200/8k1k/bs8-1p6d-mtp.yaml | 117 ++ .../srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml | 108 ++ ...ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 125 ++ ...ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 129 ++ ..._gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 217 +++ ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 138 ++ ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 122 ++ ...tx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 153 ++ ...tx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 137 ++ ...4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 126 ++ ...p4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 123 ++ ...4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 126 ++ ...ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 124 ++ ...ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 126 ++ ..._gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 155 ++ ...tx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 138 ++ .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 119 ++ .../ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml | 117 ++ .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 112 ++ .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 126 ++ .../ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml | 120 ++ .../ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml | 126 ++ .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 117 ++ .../ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml | 114 ++ .../ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml | 112 ++ .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 106 ++ .../ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml | 127 ++ .../ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml | 116 ++ .../mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml | 116 ++ .../ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml | 123 ++ .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 112 ++ .../mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml | 119 ++ .../ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml | 120 ++ .../ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml | 124 ++ .../ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml | 119 ++ .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 107 ++ .../stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml | 120 ++ .../ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml | 115 ++ .../ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml | 118 ++ .../ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml | 111 ++ .../ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml | 114 ++ ...x1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml | 121 ++ ...x1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml | 121 ++ ...x1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml | 121 ++ ...tx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml | 121 ++ .../ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml | 121 ++ .../ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml | 121 ++ .../ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml | 121 ++ .../ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml | 121 ++ ...x1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml | 115 ++ ...tx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml | 115 ++ ...ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml | 115 ++ .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml | 115 ++ ...tx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml | 115 ++ ...x2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml | 115 ++ .../ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml | 123 ++ .../ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml | 123 ++ .../ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml | 123 
++ .../ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml | 123 ++ ...ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml | 125 ++ .../ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml | 125 ++ ...x4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml | 125 ++ ...tx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml | 115 ++ ...tx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml | 115 ++ .../ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml | 117 ++ ...ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml | 115 ++ .../ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml | 116 ++ .../ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml | 116 ++ .../ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml | 116 ++ ...tx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml | 115 ++ .../ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml | 127 ++ .../ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml | 125 ++ .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 123 ++ .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 139 ++ .../ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml | 129 ++ .../ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml | 130 ++ .../ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml | 131 ++ .../ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml | 121 ++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 117 ++ .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 121 ++ .../ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml | 136 ++ .../ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml | 124 ++ .../ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml | 129 ++ .../ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml | 123 ++ .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 129 ++ .../mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml | 127 ++ .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 123 ++ .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 126 ++ .../ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml | 125 ++ .../ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml | 128 ++ .../ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml | 123 ++ .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 121 ++ .../stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml | 130 ++ .../stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml | 118 ++ .../ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml | 120 ++ .../ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml | 128 ++ .../ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml | 128 ++ ...tx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml | 133 ++ ...x1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml | 133 ++ ...ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml | 133 ++ .../ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml | 134 ++ .../ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml | 134 ++ ...x3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml | 133 ++ ...x1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml | 127 ++ ...tx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml | 127 ++ ...ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml | 127 ++ .../ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml | 128 ++ .../ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml | 128 ++ .../ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml | 128 ++ ...2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml | 127 ++ .../ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml | 133 ++ .../ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml | 134 ++ .../ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml | 134 ++ .../ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml | 134 ++ .../ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml | 133 ++ .../ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml | 133 ++ .../ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml | 128 ++ .../ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml | 128 ++ .../ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml | 127 ++ .../ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml | 127 ++ .../ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml | 128 ++ ...tx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml | 127 ++ ...x7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml | 127 ++ .../ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml 
| 117 ++ .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 121 ++ ...ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml | 152 ++ .../ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml | 128 ++ .../ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml | 213 +++ .../ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml | 113 ++ .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 175 ++ .../ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml | 207 +++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 110 ++ .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 125 ++ ...ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml | 146 ++ .../ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml | 119 ++ ...tx11_gen1_dep16_batch256_eplb256_mtp1.yaml | 152 ++ .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 123 ++ .../ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml | 117 ++ .../ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml | 128 ++ .../ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml | 119 ++ ...tx10_gen1_dep16_batch256_eplb256_mtp0.yaml | 146 ++ .../ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml | 121 ++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 112 ++ .../ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml | 112 ++ .../ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml | 115 ++ .../ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml | 127 ++ ...x1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 127 ++ ...tx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml | 121 ++ ...x1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml | 151 ++ ...x1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml | 183 ++ .../ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml | 120 ++ .../ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml | 120 ++ .../ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml | 121 ++ ...1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml | 129 ++ ...x1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml | 117 ++ ...ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml | 114 ++ ...x1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml | 177 ++ ...x1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml | 209 +++ .../ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml | 114 ++ .../ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml | 115 ++ .../ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml | 120 ++ .../ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml | 120 ++ .../ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml | 119 ++ ...tx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml | 121 ++ ...ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml | 127 ++ ...ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 120 ++ ...tx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 123 ++ .../ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml | 116 ++ .../ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml | 114 ++ .../ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml | 114 ++ ...ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml | 114 ++ ...tx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml | 117 ++ ...tx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml | 115 ++ ...x5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 121 ++ .../mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml | 121 ++ .../ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml | 216 +++ .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 119 ++ .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 124 ++ ...ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml | 139 ++ .../ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml | 127 ++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 113 ++ .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 127 ++ .../ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml | 117 ++ .../ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml | 241 +++ ...ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml | 149 ++ .../ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml | 122 ++ .../ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml | 123 ++ .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 151 ++ ...ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml | 131 ++ 
.../mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml | 122 ++ .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 119 ++ .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 122 ++ .../ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml | 119 ++ .../ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml | 120 ++ .../ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml | 146 ++ .../ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml | 129 ++ .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 117 ++ .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 113 ++ .../stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml | 114 ++ .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 114 ++ .../ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml | 116 ++ .../ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml | 122 ++ ...tx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 126 ++ ...ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml | 122 ++ .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 123 ++ .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 123 ++ ...2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml | 138 ++ ...tx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml | 124 ++ ...x3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml | 186 ++ .../ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml | 119 ++ .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 117 ++ .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 117 ++ ...2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml | 132 ++ ...x2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml | 120 ++ ...x3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml | 180 ++ ...3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml | 212 +++ ...10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 130 ++ .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 123 ++ .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 123 ++ ...ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 123 ++ ...x7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml | 138 ++ ...tx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 126 ++ .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 117 ++ .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 117 ++ .../ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml | 118 ++ ...tx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml | 120 ++ ...tx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml | 118 ++ ...x7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 124 ++ ...x7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml | 148 ++ .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 105 ++ .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 109 ++ .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 101 ++ .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 114 ++ .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 100 ++ .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 98 ++ .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 98 ++ .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 102 ++ .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 99 ++ .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 97 ++ .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 99 ++ .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 95 ++ .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 96 ++ .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 94 + .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 92 + .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 92 + .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 93 + .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 127 ++ .../ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml | 101 ++ .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 103 ++ .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 99 ++ .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 99 ++ .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 100 ++ .../ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 102 ++ .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml | 104 ++ .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 94 + 
.../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 104 ++ .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 104 ++ .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 97 ++ ...28_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml | 107 ++ ...16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml | 137 ++ .../c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml | 117 ++ ...56_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml | 107 ++ ...2_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 107 ++ ...4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 135 ++ ...12_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml | 153 ++ ...64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml | 137 ++ ...8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 107 ++ ...28_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml | 182 ++ ...16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ .../c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml | 113 ++ ...56_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml | 101 ++ ...32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ ...c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ ...12_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml | 182 ++ ...64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ ...c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 147 ++ ...128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml | 117 ++ ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml | 117 ++ .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml | 117 ++ ...256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml | 117 ++ ...c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml | 117 ++ .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml | 117 ++ ...512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml | 117 ++ ...c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml | 117 ++ .../c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml | 117 ++ ...28_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml | 114 ++ ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml | 111 ++ .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml | 111 ++ ...56_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml | 111 ++ ...32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml | 111 ++ .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml | 111 ++ ...12_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml | 111 ++ ...64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml | 111 ++ .../c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml | 111 ++ .../1k1k/disagg-gb200-1p1d-dep4-dep16.yaml | 101 ++ .../1k1k/disagg-gb200-1p4d-dep4-tep4.yaml | 98 ++ .../8k1k/disagg-gb200-1p4d-dep4-tep4.yaml | 98 ++ .../8k1k/disagg-gb200-3p1d-dep4-dep16.yaml | 101 ++ .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 101 ++ .../8k1k/disagg-gb200-6p1d-dep4-dep16.yaml | 101 ++ runners/launch_b200-dgxc.sh | 2 +- runners/launch_b300-nv.sh | 2 +- runners/launch_gb200-nv.sh | 26 +- runners/launch_gb300-nv.sh | 2 +- runners/launch_h100-dgxc-slurm.sh | 2 +- runners/launch_h200-dgxc-slurm.sh | 2 +- utils/matrix_logic/generate_sweep_configs.py | 6 + utils/matrix_logic/validation.py | 33 + 377 files changed, 47030 insertions(+), 1142 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml create mode 
100644 benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml create mode 
100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 
100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml

diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md
index 9d3c24309..f383f20ba 100644
--- a/.github/configs/CONFIGS.md
+++ b/.github/configs/CONFIGS.md
@@ -47,6 +47,32 @@
 Notes:
 - No extra fields besides the ones listed may be specified, or else the benchmarks will fail to run.
 - Setting the fields above, particularly `ep` and `dp-attn`, only guarantees that the respective values will be passed as environment variables to the benchmark scripts! Actually using those environment variables is an implementation detail at the level of the benchmark Bash script.
+## Multi-node srt-slurm recipes
+
+Multi-node configs that dispatch via `srt-slurm` (i.e. `srtctl apply -f …`) reference their recipe as a first-class field on the search-space entry:
+
+```yaml
+search-space:
+- spec-decoding: "mtp"
+  conc-list: [1214]
+  recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
+  prefill:
+    num-worker: 1
+    tp: 4
+    ep: 4
+    dp-attn: true
+  decode:
+    num-worker: 2
+    tp: 8
+    ep: 8
+    dp-attn: true
+```
+
+- `recipe` is a path **relative to `benchmarks/multi_node/srt-slurm-recipes/`** in this repo. The schema validator rejects entries whose recipe file does not exist on disk, so adding a new entry requires upstreaming the recipe YAML here first (a sketch of this check follows the diff below).
+- The path may carry an `:override[N]` / `:override_` suffix to select a named override section inside an sglang-style recipe YAML (e.g. `"b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
+- `recipe` is optional: multi-node entries that do *not* go through srt-slurm (e.g. dynamo-sglang aggregated topologies that drive their own bash) leave it unset.
+- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/`, mirroring the upstream NVIDIA/srt-slurm `recipes/` layout (e.g. `trtllm/b200-fp4/...`, `vllm/deepseek-v4/...`, `gb200-fp4/...`). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
+
 ## Runners
 
 The `runners.yaml` config represents the available runners in the repository. The keys are the runner *types* (i.e., the GPUs as well as some specific combinations like `b200-trt`) whereas each value is a list of *runner nodes*. This config is used to verify the master configs.
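The fail-fast behavior the CONFIGS.md bullets above describe is small enough to sketch. The following Python is illustrative only: the names `RECIPE_ROOT`, `split_override`, and `resolve_recipe` are assumptions for this sketch, not the repository's actual sweep-generation API, and the real validator lives in the repo's tooling.

```python
# Illustrative sketch of the fail-fast recipe check; names are hypothetical.
from pathlib import Path

# Assumed location of the vendored recipes, per CONFIGS.md above.
RECIPE_ROOT = Path("benchmarks/multi_node/srt-slurm-recipes")


def split_override(recipe: str) -> tuple[str, str | None]:
    """Split 'b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]' into the
    on-disk path and the optional override selector after the colon."""
    path, sep, override = recipe.partition(":")
    return path, (override if sep else None)


def resolve_recipe(recipe: str, repo_root: Path) -> Path:
    """Reject unknown recipes at sweep-generation time, then return the
    absolute path the benchmark template would export as CONFIG_FILE."""
    rel_path, _override = split_override(recipe)
    candidate = repo_root / RECIPE_ROOT / rel_path
    if not candidate.is_file():
        raise ValueError(f"unknown recipe {recipe!r}: no file at {candidate}")
    return candidate.resolve()
```

Note that, per the documentation, only the existence check strips the `:override` suffix; the launcher still receives the full recipe string for `srtctl`, so an entry like `recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"` resolves against the scratch copy before being handed over as `CONFIG_FILE`.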
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 9e4177ee8..4a03b1c0f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -13,14 +13,12 @@ dsr1-fp4-b200-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [1214]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -28,14 +26,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [875]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -43,14 +39,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [6]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -58,14 +52,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [10, 15, 25, 45, 90, 180]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -73,14 +65,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [ 4968 ]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -88,14 +78,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [10860]
+    recipe: "trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml"
     decode:
       num-worker: 5
       tp: 4
@@ -104,84 +92,72 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   # Non-MTP configurations
   - conc-list: [4096]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [2192]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [1365]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [6]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [10, 15, 25, 45, 90, 180]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [450]
+    recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml"
     decode:
       num-worker: 6
       tp: 8
@@ -193,14 +169,12 @@ dsr1-fp4-b200-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [90]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -208,14 +182,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [66]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml"
     decode:
       num-worker: 3
       tp: 8
@@ -223,14 +195,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [6]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -238,14 +208,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [10, 15, 30, 60]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -253,14 +221,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [548]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -268,14 +234,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1096, 1691]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml"
     prefill:
       num-worker: 5
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -283,14 +247,12 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [658]
+    recipe: "trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml"
     prefill:
       num-worker: 5
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -299,84 +261,72 @@ dsr1-fp4-b200-dynamo-trt:
       dp-attn: true
   # Non-MTP configurations
   - conc-list: [6]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [10, 15, 25, 50, 100]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [370]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml"
     prefill:
       num-worker: 2
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [1606]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml"
     prefill:
       num-worker: 4
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [837]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 4
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [2222]
+    recipe: "trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml"
     prefill:
       num-worker: 7
       tp: 4
       ep: 4
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -399,14 +349,12 @@ dsr1-fp8-b200-dynamo-trt:
   # MTP configurations - Low latency (TP attention)
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -414,14 +362,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [32]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -429,14 +375,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [64]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -444,14 +388,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [256]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -460,14 +402,12 @@ dsr1-fp8-b200-dynamo-trt:
   # MTP configurations - High throughput (DP attention)
   - spec-decoding: "mtp"
     conc-list: [896]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml"
     decode:
       num-worker: 7
       tp: 8
@@ -475,14 +415,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1024]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -490,14 +428,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1184]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml"
     decode:
       num-worker: 3
       tp: 8
@@ -505,14 +441,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1600]
+    recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -521,42 +455,36 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   # Non-MTP (STP) configurations - Low latency (TP attention)
   - conc-list: [4]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [32]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [128]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 3
       tp: 8
@@ -564,42 +492,36 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   # Non-MTP (STP) configurations - High throughput (DP attention)
   - conc-list: [1920]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [4096]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [5152]
+    recipe: "trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml"
     prefill:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -612,14 +534,12 @@ dsr1-fp8-b200-dynamo-trt:
   # MTP configurations - Low latency (TP attention)
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml"
     decode:
       num-worker: 6
       tp: 8
@@ -627,14 +547,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -642,14 +560,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [48]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml"
     decode:
       num-worker: 6
       tp: 8
@@ -657,14 +573,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [64]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -673,14 +587,12 @@ dsr1-fp8-b200-dynamo-trt:
   # MTP configurations - High throughput (DP attention)
   - spec-decoding: "mtp"
     conc-list: [224]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml"
     prefill:
       num-worker: 2
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml"
     decode:
       num-worker: 3
       tp: 8
@@ -688,14 +600,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [288]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml"
     prefill:
       num-worker: 2
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -703,14 +613,12 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1088]
+    recipe: "trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml"
     prefill:
       num-worker: 4
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -719,56 +627,48 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: true
   # Non-MTP (STP) configurations - Low latency (TP attention)
   - conc-list: [1]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: false
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [32]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml"
     decode:
       num-worker: 4
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [128]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 4
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [96]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml"
     decode:
       num-worker: 6
       tp: 8
@@ -776,56 +676,48 @@ dsr1-fp8-b200-dynamo-trt:
       dp-attn: false
   # Non-MTP (STP) configurations - High throughput (DP attention)
   - conc-list: [128]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [128]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [256]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [640]
+    recipe: "trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml"
     prefill:
       num-worker: 2
       tp: 8
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
-        - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -848,14 +740,12 @@ dsr1-fp4-b300-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [654]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -863,14 +753,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [271]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -878,14 +766,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [11]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -893,14 +779,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [10, 20, 25, 60, 120, 200]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -908,14 +792,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [2342]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml"
     prefill:
       num-worker: 2
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -923,14 +805,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [8609]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml"
     prefill:
       num-worker: 5
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -938,14 +818,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [12926]
+    recipe: "trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml"
     prefill:
       num-worker: 5
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -954,98 +832,84 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   # Non-MTP configurations
   - conc-list: [1176]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [6]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
     decode:
       num-worker: 4
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [5, 10, 15, 25]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 4
       ep: 4
       dp-attn: false
   - conc-list: [60, 110, 195, 395]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [4405]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml"
     prefill:
       num-worker: 2
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [8192]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml"
     prefill:
       num-worker: 3
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [4611]
+    recipe: "trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml"
     prefill:
       num-worker: 3
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -1057,14 +921,12 @@ dsr1-fp4-b300-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [2198]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml"
     prefill:
       num-worker: 10
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1072,14 +934,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [52]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml"
     decode:
       num-worker: 4
       tp: 4
@@ -1087,14 +947,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -1102,14 +960,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [32]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -1117,14 +973,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [181]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml"
     prefill:
       num-worker: 3
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1132,14 +986,12 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [1197]
+    recipe: "trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml"
     prefill:
       num-worker: 9
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1148,98 +1000,84 @@ dsr1-fp4-b300-dynamo-trt:
       dp-attn: true
   # Non-MTP configurations
   - conc-list: [105]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 3
       tp: 4
       ep: 4
       dp-attn: false
   - conc-list: [63]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [4]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 8
       dp-attn: false
   - conc-list: [12]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml"
     prefill:
       num-worker: 1
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml"
     decode:
       num-worker: 4
       tp: 4
       ep: 4
       dp-attn: false
   - conc-list: [589]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml"
     prefill:
       num-worker: 5
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [1093]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml"
     prefill:
       num-worker: 6
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [2048]
+    recipe: "trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml"
     prefill:
       num-worker: 8
       tp: 2
       ep: 2
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1262,14 +1100,12 @@ dsr1-fp8-b300-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [10]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -1277,14 +1113,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [160]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -1292,14 +1126,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [3072]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1307,14 +1139,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [2560]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -1322,14 +1152,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [720]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml"
     decode:
       num-worker: 5
       tp: 8
@@ -1337,14 +1165,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [11264]
+    recipe: "trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -1355,98 +1181,84 @@ dsr1-fp8-b300-dynamo-trt:
   osl: 1024
   search-space:
   - conc-list: [2112]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
   - conc-list: [3072]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml"
     decode:
       num-worker: 2
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [1280]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml"
     decode:
       num-worker: 3
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [12]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml"
     decode:
       num-worker: 8
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [128]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml"
     decode:
       num-worker: 8
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [384]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml"
     decode:
       num-worker: 8
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [16384]
+    recipe: "trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml"
     prefill:
       num-worker: 2
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1458,14 +1270,12 @@ dsr1-fp8-b300-dynamo-trt:
   search-space:
   - spec-decoding: "mtp"
     conc-list: [40]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml"
     decode:
       num-worker: 2
       tp: 8
@@ -1473,14 +1283,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -1488,14 +1296,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [20]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml"
     decode:
       num-worker: 4
       tp: 8
@@ -1503,14 +1309,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [72]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1518,14 +1322,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [144]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml"
     prefill:
       num-worker: 2
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1533,14 +1335,12 @@ dsr1-fp8-b300-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [512]
+    recipe: "trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml"
     prefill:
       num-worker: 4
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -1551,98 +1351,84 @@ dsr1-fp8-b300-dynamo-trt:
   osl: 1024
   search-space:
   - conc-list: [64]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml"
     decode:
       num-worker: 4
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [16]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml"
     prefill:
       num-worker: 1
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml"
     decode:
       num-worker: 8
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [256]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
     prefill:
       num-worker: 2
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [512]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [256]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
     prefill:
       num-worker: 3
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
     decode:
       num-worker: 5
       tp: 8
       ep: 1
       dp-attn: false
   - conc-list: [1075]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
     prefill:
       num-worker: 5
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
     decode:
       num-worker: 1
       tp: 8
       ep: 1
       dp-attn: true
   - conc-list: [3072]
+    recipe: "trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
     prefill:
       num-worker: 7
       tp: 4
       ep: 1
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
-        - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
     decode:
       num-worker: 1
       tp: 8
@@ -2654,14 +2440,12 @@ dsr1-fp8-h200-dynamo-trt:
   # MTP configurations
   - spec-decoding: "mtp"
     conc-list: [1]
+    recipe: "trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
     decode:
       num-worker: 11
       tp: 8
@@ -2669,14 +2453,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [4]
+    recipe: "trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 11
       tp: 8
@@ -2684,14 +2466,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [8]
+    recipe: "trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 11
       tp: 8
@@ -2699,14 +2479,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [16]
+    recipe: "trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 9
       tp: 8
@@ -2714,14 +2492,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [32]
+    recipe: "trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 11
       tp: 8
@@ -2729,14 +2505,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: false
   - spec-decoding: "mtp"
     conc-list: [64]
+    recipe: "trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 8
       tp: 8
@@ -2744,14 +2518,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [128]
+    recipe: "trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
       dp-attn: true
-      additional-settings:
-        # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
-        - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml"
     decode:
       num-worker: 7
       tp: 8
@@ -2759,14 +2531,12 @@ dsr1-fp8-h200-dynamo-trt:
       dp-attn: true
   - spec-decoding: "mtp"
     conc-list: [256]
+    recipe: "trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml"
     prefill:
       num-worker: 1
       tp: 8
       ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml"
         decode:
           num-worker: 4
           tp: 8
@@ -2774,14 +2544,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [512]
+        recipe: "trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml"
         decode:
           num-worker: 2
           tp: 8
@@ -2789,126 +2557,108 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       # Non-MTP configurations (STP)
       - conc-list: [1]
+        recipe: "trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [4]
+        recipe: "trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [8]
+        recipe: "trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [16]
+        recipe: "trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [32]
+        recipe: "trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [64]
+        recipe: "trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [128]
+        recipe: "trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 9
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [256]
+        recipe: "trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 6
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [512]
+        recipe: "trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 7
           tp: 8
@@ -2920,14 +2670,12 @@ dsr1-fp8-h200-dynamo-trt:
       # MTP configurations
       - spec-decoding: "mtp"
         conc-list: [1]
+        recipe: "trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml"
         decode:
           num-worker: 7
           tp: 8
@@ -2935,14 +2683,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [4]
+        recipe: "trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml"
         decode:
           num-worker: 7
           tp: 8
@@ -2950,14 +2696,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [8]
+        recipe: "trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml"
         decode:
           num-worker: 6
           tp: 8
@@ -2965,14 +2709,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [16]
+        recipe: "trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -2980,14 +2722,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [32]
+        recipe: "trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml"
         prefill:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml"
         decode:
           num-worker: 5
           tp: 8
@@ -2995,14 +2735,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [64]
+        recipe: "trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3010,14 +2748,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [128]
+        recipe: "trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3025,14 +2761,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [256]
+        recipe: "trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3040,14 +2774,12 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [512]
+        recipe: "trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml"
         prefill:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3055,126 +2787,108 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
       # Non-MTP configurations (STP)
       - conc-list: [1]
+        recipe: "trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 7
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [4]
+        recipe: "trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 7
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [8]
+        recipe: "trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 6
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [16]
+        recipe: "trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [32]
+        recipe: "trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml"
         decode:
           num-worker: 5
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [64]
+        recipe: "trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [128]
+        recipe: "trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [256]
+        recipe: "trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 5
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [512]
+        recipe: "trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -3197,14 +2911,12 @@ dsr1-fp8-h100-dynamo-trt:
       # MTP configurations
       - spec-decoding: "mtp"
         conc-list: [6]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3212,14 +2924,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [9]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3227,14 +2937,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [30]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3242,14 +2950,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [60]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3257,14 +2963,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [117]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3272,14 +2976,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [231]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3287,14 +2989,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [462]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3302,14 +3002,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [615]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3317,14 +3015,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1229]
+        recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3332,126 +3028,108 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: true
       # Non-MTP configurations (STP)
       - conc-list: [6]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [9]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [30]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [60]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [231]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [462]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [924]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [1845]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [4916]
+        recipe: "trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3463,14 +3141,12 @@ dsr1-fp8-h100-dynamo-trt:
       # MTP configurations (6 points)
       - spec-decoding: "mtp"
         conc-list: [6]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3478,14 +3154,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [9]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3493,14 +3167,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [30]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 3
           tp: 16
@@ -3508,14 +3180,12 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [77]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3525,14 +3195,12 @@ dsr1-fp8-h100-dynamo-trt:
       # https://github.com/InferenceMAX/InferenceMAX/actions/runs/21769314582/job/62813105509
       # - spec-decoding: "mtp"
      #   conc-list: [78]
+      #   recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 16
       #     dp-attn: true
-      #     additional-settings:
-      #       # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
-      #       - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml"
       #   decode:
       #     num-worker: 2
       #     tp: 16
@@ -3540,14 +3208,12 @@ dsr1-fp8-h100-dynamo-trt:
       #     dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [154]
+        recipe: "trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 2
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3555,70 +3221,60 @@ dsr1-fp8-h100-dynamo-trt:
         dp-attn: true
       # STP configurations (5 points)
       - conc-list: [6]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [9]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [30]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml"
         decode:
           num-worker: 3
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [154]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml"
         decode:
           num-worker: 2
           tp: 16
           ep: 16
           dp-attn: false
       - conc-list: [308]
+        recipe: "trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 16
           ep: 16
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3838,13 +3494,12 @@ dsr1-fp8-h100-dynamo-sglang:
     search-space:
       # # STP: Max throughput TEP (1 prefill, 2 decode)
       # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
+      #   recipe: "h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 1
       #     dp-attn: false
-      #     additional-settings:
-      #       - "CONFIG_FILE=recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml"
       #   decode:
       #     num-worker: 2
       #     tp: 16
@@ -3852,13 +3507,12 @@ dsr1-fp8-h100-dynamo-sglang:
      #     dp-attn: false
       # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention)
       # - conc-list: [1, 2, 4, 8, 16, 32, 64]
+      #   recipe: "h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 1
       #     dp-attn: false
-      #     additional-settings:
-      #       - "CONFIG_FILE=recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml"
       #   decode:
       #     num-worker: 1
       #     tp: 16
@@ -3867,13 +3521,12 @@ dsr1-fp8-h100-dynamo-sglang:
       # MTP: Max throughput TEP (1 prefill, 2 decode)
       - spec-decoding: "mtp"
         conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
+        recipe: "h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 1
           dp-attn: false
-          additional-settings:
-            - "CONFIG_FILE=recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml"
         decode:
           num-worker: 2
           tp: 16
@@ -3882,13 +3535,12 @@ dsr1-fp8-h100-dynamo-sglang:
       # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention)
       - spec-decoding: "mtp"
         conc-list: [1, 2, 4, 8, 16, 32, 64]
+        recipe: "h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 1
           dp-attn: false
-          additional-settings:
-            - "CONFIG_FILE=recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3899,13 +3551,12 @@ dsr1-fp8-h100-dynamo-sglang:
     search-space:
       # # STP: Max throughput TEP (1 prefill, 1 decode)
       # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
+      #   recipe: "h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 1
       #     dp-attn: false
-      #     additional-settings:
-      #       - "CONFIG_FILE=recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml"
       #   decode:
       #     num-worker: 1
       #     tp: 16
@@ -3913,13 +3564,12 @@ dsr1-fp8-h100-dynamo-sglang:
      #     dp-attn: false
       # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention)
       # - conc-list: [1, 2, 4, 8, 16, 32, 64]
+      #   recipe: "h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml"
       #   prefill:
       #     num-worker: 1
       #     tp: 16
       #     ep: 1
       #     dp-attn: false
-      #     additional-settings:
-      #       - "CONFIG_FILE=recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml"
       #   decode:
       #     num-worker: 1
       #     tp: 16
@@ -3928,13 +3578,12 @@ dsr1-fp8-h100-dynamo-sglang:
       # MTP: Max throughput TEP (1 prefill, 1 decode)
       - spec-decoding: "mtp"
         conc-list: [1, 2, 4, 8, 16, 32, 64, 128]
+        recipe: "h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 1
           dp-attn: false
-          additional-settings:
-            - "CONFIG_FILE=recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -3943,13 +3592,12 @@ dsr1-fp8-h100-dynamo-sglang:
       # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention)
       - spec-decoding: "mtp"
         conc-list: [1, 2, 4, 8, 16, 32, 64]
+        recipe: "h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 16
           ep: 1
           dp-attn: false
-          additional-settings:
-            - "CONFIG_FILE=recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4039,14 +3687,12 @@ dsr1-fp4-gb200-dynamo-trt:
       # MTP configurations (spec_decoding="mtp")
       - spec-decoding: "mtp"
         conc-list: [ 180 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4054,14 +3700,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 4, 8, 12, 24, 48 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 4
           tp: 8
@@ -4069,14 +3713,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [ 4301 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml"
         prefill:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4084,14 +3726,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 2253 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml"
         prefill:
           num-worker: 3
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4099,14 +3739,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 16130 ]
+        recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml"
         prefill:
           num-worker: 3
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml"
         decode:
           num-worker: 5
           tp: 4
@@ -4116,98 +3754,84 @@ dsr1-fp4-gb200-dynamo-trt:
 
       # Non-MTP configurations (default spec_decoding="none")
       - conc-list: [ 4301 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [ 666 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [ 6144 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml"
         decode:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
       - conc-list: [ 12, 24, 48, 96, 192 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 4
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [ 5 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 4
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [ 4301 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [ 2253 ]
+        recipe: "trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4220,14 +3844,12 @@ dsr1-fp4-gb200-dynamo-trt:
       # MTP configurations (spec_decoding="mtp")
       - spec-decoding: "mtp"
         conc-list: [ 4, 8, 12, 24, 48 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml"
         decode:
           num-worker: 4
           tp: 8
@@ -4235,14 +3857,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [ 180 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml"
         prefill:
           num-worker: 3
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4250,14 +3870,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 1229 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml"
         prefill:
           num-worker: 7
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4265,14 +3883,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 666 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml"
         prefill:
           num-worker: 8
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4280,14 +3896,12 @@ dsr1-fp4-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [ 4301 ]
+        recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml"
         prefill:
           num-worker: 11
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4296,84 +3910,72 @@ dsr1-fp4-gb200-dynamo-trt:
 
       # Non-MTP configurations (default spec_decoding="none")
       - conc-list: [ 12, 44, 76 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml"
         decode:
           num-worker: 4
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [ 5 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         decode:
           num-worker: 4
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [ 333 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [ 1229 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 7
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [ 2253 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 8
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [ 4096 ]
+        recipe: "trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml"
         prefill:
           num-worker: 10
           tp: 4
           ep: 4
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4397,14 +3999,12 @@ dsr1-fp8-gb200-dynamo-trt:
     search-space:
       - spec-decoding: "mtp"
         conc-list: [4301]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -4412,14 +4012,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [2151]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml"
         decode:
           num-worker: 1
           tp: 8
@@ -4427,14 +4025,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1229]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml"
         decode:
           num-worker: 1
           tp: 16
@@ -4442,14 +4038,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [615]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml"
         decode:
           num-worker: 1
           tp: 32
@@ -4457,14 +4051,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [36]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -4472,14 +4064,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [18]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -4487,14 +4077,12 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [9]
+        recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -4502,98 +4090,84 @@ dsr1-fp8-gb200-dynamo-trt:
         dp-attn: false
       # 1k1k STP configs
       - conc-list: [6144]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [4301]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
       - conc-list: [2151]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml"
         decode:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
       - conc-list: [1127]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [256]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml"
         decode:
           num-worker: 1
           tp: 32
           ep: 32
           dp-attn: true
       - conc-list: [27]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml"
         decode:
           num-worker: 3
           tp: 8
           ep: 8
           dp-attn: false
       - conc-list: [3]
+        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml"
         prefill:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
-          additional-settings:
-            # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml
-            - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml"
         decode:
           num-worker: 3
           tp: 8
@@ -4605,14 +4179,12 @@ dsr1-fp8-gb200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" decode: num-worker: 1 tp: 8 @@ -4620,14 +4192,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 5 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 tp: 16 @@ -4635,14 +4205,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" decode: num-worker: 1 tp: 16 @@ -4650,14 +4218,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" prefill: num-worker: 4 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" decode: num-worker: 1 tp: 32 @@ -4665,14 +4231,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [90] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" decode: num-worker: 1 tp: 32 @@ -4680,14 +4244,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [15] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" decode: num-worker: 3 tp: 8 @@ -4695,14 +4257,12 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [6] + recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" decode: num-worker: 3 tp: 8 @@ -4710,98 +4270,84 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 8k1k STP configs - conc-list: [1229] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 5 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [666] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [615] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [333] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [63] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false - conc-list: [18] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false - conc-list: [6] + recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" decode: num-worker: 3 tp: 8 @@ -4824,14 +4370,12 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] + 
recipe: "gb200-fp8/1k1k/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/low-latency.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml" decode: num-worker: 1 tp: 4 @@ -4840,14 +4384,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) - conc-list: [1024, 2048, 4096] + recipe: "gb200-fp8/1k1k/mid-curve.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml" decode: num-worker: 1 tp: 48 @@ -4856,14 +4398,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] + recipe: "gb200-fp8/1k1k/max-tpt.yaml" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml" decode: num-worker: 1 tp: 32 @@ -4872,14 +4412,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Ultra throughput" (1 prefill workers at DEP8 and 1 decode worker at DEP8) - conc-list: [4096] + recipe: "gb200-fp8/1k1k/ultra-tpt.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml" decode: num-worker: 1 tp: 8 @@ -4891,14 +4429,12 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) - conc-list: [4, 8, 16] + recipe: "gb200-fp8/8k1k/low-latency.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/low-latency.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml" decode: num-worker: 1 tp: 8 @@ -4907,14 +4443,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [512, 1024, 2048, 6144] + recipe: "gb200-fp8/8k1k/mid-curve.yaml" prefill: num-worker: 5 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml" decode: num-worker: 1 tp: 32 @@ -4923,14 +4457,12 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096, 6144] + recipe: "gb200-fp8/8k1k/max_tpt.yaml" prefill: num-worker: 6 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml" decode: num-worker: 1 tp: 24 @@ -4952,14 +4484,12 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) - conc-list: [4, 8, 16, 32] + recipe: "gb300-fp8/1k1k/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml - - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml" decode: 
num-worker: 4 tp: 4 @@ -4968,14 +4498,12 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] + recipe: "gb300-fp8/1k1k/stp/mid.yaml" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml - - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml" decode: num-worker: 1 tp: 32 @@ -4984,14 +4512,12 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) - conc-list: [4096, 7168, 7680] + recipe: "gb300-fp8/1k1k/stp/max.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/max.yaml - - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml" decode: num-worker: 1 tp: 8 @@ -5003,14 +4529,12 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] + recipe: "gb300-fp8/8k1k/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml - - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml" decode: num-worker: 1 tp: 4 @@ -5019,14 +4543,12 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [128, 256, 512, 1024] + recipe: "gb300-fp8/8k1k/stp/mid.yaml" prefill: num-worker: 5 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml - - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml" decode: num-worker: 1 tp: 32 @@ -5035,14 +4557,12 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096] + recipe: "gb300-fp8/8k1k/stp/max.yaml" prefill: num-worker: 6 tp: 8 ep: 8 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/max.yaml - - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml" decode: num-worker: 1 tp: 24 @@ -5066,13 +4586,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] + recipe: "gb200-fp4/1k1k/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/1k1k/low-latency.yaml" decode: num-worker: 2 tp: 4 @@ -5082,13 +4601,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] + recipe: "gb200-fp4/1k1k/mid-curve.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/1k1k/mid-curve.yaml" decode: num-worker: 1 tp: 32 @@ -5098,13 +4616,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 2048, 4096 ] + recipe: "gb200-fp4/1k1k/max-tpt.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/1k1k/max-tpt.yaml" decode: num-worker: 1 tp: 48 @@ -5118,13 +4635,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8 ] + recipe: 
"gb200-fp4/8k1k/low-latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/8k1k/low-latency.yaml" decode: num-worker: 4 tp: 4 @@ -5134,13 +4650,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] + recipe: "gb200-fp4/8k1k/mid-curve.yaml" prefill: num-worker: 6 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/8k1k/mid-curve.yaml" decode: num-worker: 1 tp: 48 @@ -5150,13 +4665,12 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] + recipe: "gb200-fp4/8k1k/max-tpt.yaml" prefill: num-worker: 10 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/8k1k/max-tpt.yaml" decode: num-worker: 1 tp: 32 @@ -5179,14 +4693,12 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [3226] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" decode: num-worker: 1 tp: 4 @@ -5194,14 +4706,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" decode: num-worker: 1 tp: 32 @@ -5209,14 +4719,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [5] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 4 tp: 8 @@ -5224,14 +4732,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 12, 24, 48] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 4 tp: 8 @@ -5239,14 +4745,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [2253] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" decode: num-worker: 1 tp: 16 @@ -5254,14 +4758,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - 
spec-decoding: "mtp" conc-list: [1229] + recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" decode: num-worker: 1 tp: 32 @@ -5269,84 +4771,72 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [5] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [12, 48, 96, 192] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [8192] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [1229] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [4301] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [2253] + recipe: "trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -5358,14 +4848,12 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [33] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - 
additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 3 tp: 8 @@ -5373,14 +4861,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [5] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" decode: num-worker: 4 tp: 8 @@ -5388,14 +4874,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [12, 24] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" decode: num-worker: 4 tp: 8 @@ -5403,14 +4887,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 4 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 @@ -5418,14 +4900,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [308] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 @@ -5433,14 +4913,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" decode: num-worker: 1 tp: 8 @@ -5448,14 +4926,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" prefill: num-worker: 10 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 16 @@ -5463,14 +4939,12 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1127] + recipe: 
"trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" prefill: num-worker: 13 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" decode: num-worker: 1 tp: 16 @@ -5478,112 +4952,96 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [72] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false - conc-list: [5] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [12] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [5, 15, 30] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 4 ep: 4 dp-attn: false - conc-list: [666] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [1229] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" prefill: num-worker: 9 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [3228] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" prefill: num-worker: 11 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" decode: num-worker: 3 tp: 4 ep: 4 dp-attn: true - conc-list: [2253] + recipe: "trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 14 tp: 2 ep: 2 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 @@ -5607,13 +5065,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] + recipe: "gb300-fp4/1k1k/low_latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/1k1k/low_latency.yaml" decode: num-worker: 2 tp: 4 @@ -5623,13 +5080,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] + recipe: "gb300-fp4/1k1k/mid_curve.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/1k1k/mid_curve.yaml" decode: num-worker: 1 tp: 32 @@ -5639,13 +5095,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] + recipe: "gb300-fp4/1k1k/max_tpt.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/1k1k/max_tpt.yaml" decode: num-worker: 1 tp: 48 @@ -5659,13 +5114,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32, 64 ] + recipe: "gb300-fp4/8k1k/low_latency.yaml" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/8k1k/low_latency.yaml" decode: num-worker: 4 tp: 4 @@ -5675,13 +5129,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] + recipe: "gb300-fp4/8k1k/mid_curve.yaml" prefill: num-worker: 6 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/8k1k/mid_curve.yaml" decode: num-worker: 1 tp: 48 @@ -5691,13 +5144,12 @@ dsr1-fp4-gb300-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] + recipe: "gb300-fp4/8k1k/max_tpt.yaml" prefill: num-worker: 10 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/8k1k/max_tpt.yaml" decode: num-worker: 1 tp: 32 @@ -5720,14 +5172,12 @@ dsr1-fp8-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [8] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 4 tp: 8 @@ -5735,14 +5185,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [24] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" decode: num-worker: 4 tp: 8 @@ -5750,14 +5198,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" decode: num-worker: 1 tp: 32 @@ -5765,14 +5211,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [564] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" decode: num-worker: 1 tp: 32 @@ -5780,14 +5224,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 tp: 16 @@ -5795,14 +5237,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" decode: num-worker: 1 tp: 16 @@ -5810,14 +5250,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [8192] + recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" decode: num-worker: 2 tp: 8 @@ -5825,98 +5263,84 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true # STP configurations (no spec_decoding) - conc-list: [4] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [24] + recipe: 
"trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [84] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [1229] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [2253] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [8602] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - conc-list: [12288] + recipe: "trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" decode: num-worker: 2 tp: 8 @@ -5928,14 +5352,12 @@ dsr1-fp8-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [8] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" decode: num-worker: 4 tp: 8 @@ -5943,14 +5365,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [24] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" decode: num-worker: 4 tp: 8 @@ -5958,14 +5378,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [333] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" prefill: num-worker: 6 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" decode: num-worker: 1 tp: 32 @@ -5973,14 +5391,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 8 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" decode: num-worker: 1 tp: 16 @@ -5988,14 +5404,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" prefill: num-worker: 10 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" decode: num-worker: 1 tp: 16 @@ -6003,14 +5417,12 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] + recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" decode: num-worker: 1 tp: 8 @@ -6018,98 +5430,84 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true # STP configurations (no spec_decoding) - conc-list: [4] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [24] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [36] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - 
additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [512] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" prefill: num-worker: 6 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [666] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [1229] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [2151] + recipe: "trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" decode: num-worker: 1 tp: 8 @@ -6402,13 +5800,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "none" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] + recipe: "h200/1k1k/low-latency-1p9d.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d.yaml" decode: num-worker: 9 tp: 8 @@ -6417,13 +5814,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [512, 1024, 2048] + recipe: "h200/1k1k/bs256-1p6d-tp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp.yaml" decode: num-worker: 6 tp: 8 @@ -6432,13 +5828,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "none" conc-list: [128, 256, 512, 1024, 2048] + recipe: "h200/1k1k/bs256-1p6d-dep.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep.yaml" decode: num-worker: 6 tp: 8 @@ -6447,13 +5842,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "mtp" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] + recipe: "h200/1k1k/low-latency-1p9d-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - 
"CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d-mtp.yaml" decode: num-worker: 9 tp: 8 @@ -6462,13 +5856,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [512, 1024, 2048] + recipe: "h200/1k1k/bs256-1p6d-tp-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml" decode: num-worker: 6 tp: 8 @@ -6477,13 +5870,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "mtp" conc-list: [128, 256, 512, 1024, 2048] + recipe: "h200/1k1k/bs256-1p6d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml" decode: num-worker: 6 tp: 8 @@ -6495,13 +5887,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "none" conc-list: [1, 4, 8] + recipe: "h200/8k1k/bs4-1p7d.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d.yaml" decode: num-worker: 7 tp: 8 @@ -6510,13 +5901,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [4, 8, 16] + recipe: "h200/8k1k/bs8-1p6d.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d.yaml" decode: num-worker: 6 tp: 8 @@ -6525,13 +5915,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (1 prefill, 3 decode) - spec-decoding: "none" conc-list: [8, 16, 32] + recipe: "h200/8k1k/bs16-1p3d.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d.yaml" decode: num-worker: 3 tp: 8 @@ -6540,13 +5929,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (2 prefill, 3 decode) - spec-decoding: "none" conc-list: [32, 64, 128] + recipe: "h200/8k1k/bs64-2p3d.yaml" prefill: num-worker: 2 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d.yaml" decode: num-worker: 3 tp: 8 @@ -6555,13 +5943,12 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "none" conc-list: [64, 128, 256] + recipe: "h200/8k1k/bs128-1p1d-dep.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep.yaml" decode: num-worker: 1 tp: 8 @@ -6570,13 +5957,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "mtp" conc-list: [1, 4, 8] + recipe: "h200/8k1k/bs4-1p7d-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d-mtp.yaml" decode: num-worker: 7 tp: 8 @@ -6585,13 +5971,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [2, 4, 8, 16, 32] + recipe: "h200/8k1k/bs8-1p6d-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d-mtp.yaml" decode: num-worker: 6 tp: 8 @@ -6600,13 +5985,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (1 prefill, 3 decode) - spec-decoding: "mtp" conc-list: [4, 8, 16, 32, 64] + recipe: "h200/8k1k/bs16-1p3d-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d-mtp.yaml" decode: num-worker: 3 tp: 8 @@ -6615,13 +5999,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: 
TEP (2 prefill, 3 decode) - spec-decoding: "mtp" conc-list: [32, 64, 128] + recipe: "h200/8k1k/bs64-2p3d-mtp.yaml" prefill: num-worker: 2 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d-mtp.yaml" decode: num-worker: 3 tp: 8 @@ -6630,13 +6013,12 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [32, 64, 128, 256, 512] + recipe: "h200/8k1k/bs128-1p1d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml" decode: num-worker: 1 tp: 8 @@ -6658,52 +6040,48 @@ dsr1-fp4-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [16, 128] + recipe: "b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]" decode: num-worker: 5 tp: 8 ep: 8 dp-attn: false - conc-list: [32, 64, 256] + recipe: "b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]" decode: num-worker: 6 tp: 8 ep: 8 dp-attn: false - conc-list: [512] + recipe: "b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [512] + recipe: "b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]" decode: num-worker: 2 tp: 8 @@ -6714,65 +6092,60 @@ dsr1-fp4-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [64, 128] + recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: false - conc-list: [8] + recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]" decode: num-worker: 5 tp: 8 ep: 8 dp-attn: false - conc-list: [4, 128] + recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]" decode: num-worker: 5 tp: 8 ep: 8 dp-attn: false - conc-list: [4, 8, 16, 64] + recipe: "b200-fp4/8k1k.yaml:override_stp_tp4" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_tp4" decode: num-worker: 1 tp: 8 ep: 1 dp-attn: false - conc-list: [1024, 2048] + recipe: "b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d" decode: num-worker: 2 tp: 8 @@ -6794,52 +6167,48 @@ dsr1-fp8-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [4] + recipe: "b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]" decode: 
num-worker: 1 tp: 8 ep: 8 dp-attn: false - conc-list: [16, 32, 64, 128, 256] + recipe: "b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false - conc-list: [1024, 2048, 4096] + recipe: "b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]" decode: num-worker: 5 tp: 8 ep: 8 dp-attn: true - conc-list: [2048, 4096] + recipe: "b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]" decode: num-worker: 5 tp: 8 @@ -6850,42 +6219,36 @@ dsr1-fp8-b200-dynamo-sglang: search-space: # STP low-latency: resolved from 8k1k.yaml zip_override_stp_lowlat - conc-list: [128] + recipe: "b200-fp8/8k1k_stp_lowlat_0.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_0.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_0.yaml" decode: num-worker: 3 tp: 8 ep: 1 dp-attn: false - conc-list: [128] + recipe: "b200-fp8/8k1k_stp_lowlat_1.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_1.yaml" decode: num-worker: 4 tp: 8 ep: 1 dp-attn: false - conc-list: [8, 16, 32, 64, 128] + recipe: "b200-fp8/8k1k_stp_lowlat_2.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_2.yaml" decode: num-worker: 6 tp: 8 @@ -6893,56 +6256,48 @@ dsr1-fp8-b200-dynamo-sglang: dp-attn: false # STP max-throughput: resolved from 8k1k.yaml zip_override_stp_maxtpt - conc-list: [288] + recipe: "b200-fp8/8k1k_stp_maxtpt_0.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml" decode: num-worker: 2 tp: 8 ep: 8 dp-attn: true - conc-list: [160, 288] + recipe: "b200-fp8/8k1k_stp_maxtpt_1.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [512] + recipe: "b200-fp8/8k1k_stp_maxtpt_2.yaml" prefill: num-worker: 2 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [1024] + recipe: "b200-fp8/8k1k_stp_maxtpt_3.yaml" prefill: num-worker: 3 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml" decode: num-worker: 1 
tp: 8 @@ -6965,13 +6320,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: 1P1D - spec-decoding: "mtp" conc-list: [4, 64] + recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 1 tp: 8 @@ -6980,13 +6334,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: 1P3D - spec-decoding: "mtp" conc-list: [4, 8, 16, 32, 128] + recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 3 tp: 8 @@ -6995,13 +6348,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 1P5D - spec-decoding: "mtp" conc-list: [512, 4096] + recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]" decode: num-worker: 5 tp: 8 @@ -7010,13 +6362,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 2P5D - spec-decoding: "mtp" conc-list: [1024, 2048, 4096] + recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]" prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]" decode: num-worker: 5 tp: 8 @@ -7025,13 +6376,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 1P2D - spec-decoding: "mtp" conc-list: [512, 1024, 2048] + recipe: "b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d" decode: num-worker: 2 tp: 8 @@ -7043,14 +6393,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: resolved from 8k1k.yaml zip_override_mtp_lowlat - spec-decoding: "mtp" conc-list: [128] + recipe: "b200-fp8/8k1k_mtp_lowlat_0.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml" decode: num-worker: 3 tp: 8 @@ -7058,14 +6406,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [128] + recipe: "b200-fp8/8k1k_mtp_lowlat_1.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml" decode: num-worker: 4 tp: 8 @@ -7073,14 +6419,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 16, 32, 64, 128] + recipe: "b200-fp8/8k1k_mtp_lowlat_2.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml" decode: num-worker: 6 tp: 8 @@ -7089,14 +6433,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-throughput: resolved from 8k1k.yaml zip_override_mtp_maxtpt - spec-decoding: "mtp" conc-list: [288] + recipe: "b200-fp8/8k1k_mtp_maxtpt_0.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml - - 
"CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml" decode: num-worker: 2 tp: 8 @@ -7104,14 +6446,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [160, 288] + recipe: "b200-fp8/8k1k_mtp_maxtpt_1.yaml" prefill: num-worker: 1 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml" decode: num-worker: 1 tp: 8 @@ -7119,14 +6459,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [512] + recipe: "b200-fp8/8k1k_mtp_maxtpt_2.yaml" prefill: num-worker: 2 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml" decode: num-worker: 1 tp: 8 @@ -7134,14 +6472,12 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [1024] + recipe: "b200-fp8/8k1k_mtp_maxtpt_3.yaml" prefill: num-worker: 3 tp: 8 ep: 1 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml" decode: num-worker: 1 tp: 8 @@ -7163,14 +6499,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: search-space: - spec-decoding: "mtp" conc-list: [16, 512] + recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 5 tp: 8 @@ -7178,14 +6512,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [32, 64, 256, 512] + recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 6 tp: 8 @@ -7193,14 +6525,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [512, 1024] + recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" decode: num-worker: 1 tp: 8 @@ -7208,14 +6538,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [512] + recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" decode: num-worker: 2 tp: 8 @@ -7229,14 +6557,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: search-space: - spec-decoding: "mtp" conc-list: [64, 128] + recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]" decode: num-worker: 1 tp: 8 @@ 
-7244,14 +6570,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [8] + recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]" decode: num-worker: 5 tp: 8 @@ -7259,14 +6583,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [4, 128] + recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]" decode: num-worker: 5 tp: 8 @@ -7274,14 +6596,12 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [4, 8, 16, 64] + recipe: "b200-fp4/8k1k.yaml:override_mtp_tp4" prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4" decode: num-worker: 1 tp: 8 @@ -7303,98 +6623,84 @@ kimik2.5-fp4-gb200-dynamo-trt: search-space: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4, 192, 360, 668 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [ 5, 15, 30, 55 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 4 ep: 4 dp-attn: false - conc-list: [ 666 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [ 2253 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml - - 
"CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - conc-list: [ 4301, 6452 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [ 4301 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [ 4301 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 @@ -7406,98 +6712,84 @@ kimik2.5-fp4-gb200-dynamo-trt: search-space: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - conc-list: [ 156 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" decode: num-worker: 4 tp: 4 ep: 4 dp-attn: false - conc-list: [ 5, 15, 30, 60, 105 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml - - 
"CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" decode: num-worker: 5 tp: 4 ep: 4 dp-attn: false - conc-list: [ 333 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [ 615 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [ 2151 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [ 2253 ] + recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 @@ -7518,28 +6810,24 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [256, 512, 1024, 2048, 3072, 4096] + recipe: "vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [4, 8, 16, 32, 64, 128] + recipe: "vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml" decode: num-worker: 4 tp: 4 @@ -7549,56 +6837,48 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [4, 8, 16, 32, 128] + recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: 
true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" decode: num-worker: 4 tp: 4 ep: 4 dp-attn: false - conc-list: [512, 1024] + recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" prefill: num-worker: 3 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - conc-list: [2048] + recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" prefill: num-worker: 5 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - conc-list: [3072, 4096] + recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" prefill: num-worker: 6 tp: 4 ep: 4 dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" decode: num-worker: 1 tp: 16 @@ -7625,13 +6905,12 @@ dsv4-fp4-gb200-dynamo-vllm: # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). - conc-list: [1, 4, 8, 16, 32, 64] + recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: num-worker: 1 tp: 8 @@ -7640,13 +6919,12 @@ dsv4-fp4-gb200-dynamo-vllm: # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [128, 256, 1024, 2048, 4096] + recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 @@ -7656,13 +6934,12 @@ dsv4-fp4-gb200-dynamo-vllm: # The 4096 overlap with the 1p1d block gives a crossover point. 8192 # would saturate 1p1d's prefill, so this topology takes over there. - conc-list: [4096, 8192] + recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 @@ -7675,13 +6952,12 @@ dsv4-fp4-gb200-dynamo-vllm: # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - conc-list: [1, 4, 8, 16, 32, 64] + recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: num-worker: 1 tp: 8 @@ -7689,13 +6965,12 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: false # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. 
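    # Node math (4 GPUs/node on GB200): 3 prefills x 8 GPUs + 16 decode GPUs = 40 GPUs = 10 nodes.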
- conc-list: [512, 1024] + recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 @@ -7704,13 +6979,12 @@ dsv4-fp4-gb200-dynamo-vllm: # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - conc-list: [4096, 8192] + recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" prefill: num-worker: 7 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 75036a986..b6b6a30f3 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -77,6 +77,11 @@ on: required: false type: string default: "[]" + recipe: + description: "Path under benchmarks/multi_node/srt-slurm-recipes/ identifying the srt-slurm recipe to dispatch. May carry an `:override[N]` suffix. Empty for non-srt-slurm multi-node configs." + required: false + type: string + default: "" run-eval: type: boolean required: false @@ -165,6 +170,7 @@ jobs: env: RUNNER_NAME: ${{ runner.name }} RUNNER_TYPE: ${{ inputs.runner }} + RECIPE: ${{ inputs.recipe }} # Hash uniquely on {EXP_NAME}_{PRECISION}_{FRAMEWORK}_prefill-tp{}-ep{}-dp{}-nw{}_decode-tp{}-ep{}-dp{}-nw{}_disagg-{}_spec-{}_conc{}_{runner} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_prefill-tp${{ env.PREFILL_TP }}-ep${{ env.PREFILL_EP }}-dp${{ env.PREFILL_DP_ATTN }}-nw${{ env.PREFILL_NUM_WORKERS }}_decode-tp${{ env.DECODE_TP }}-ep${{ env.DECODE_EP }}-dp${{ env.DECODE_DP_ATTN }}-nw${{ env.DECODE_NUM_WORKERS }}_disagg-${{ env.DISAGG }}_spec-${{ env.SPEC_DECODING }}_conc${{ join(fromJson(inputs.conc-list), 'x') }}_${{ runner.name }} run: | @@ -173,6 +179,23 @@ jobs: echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} + # Resolve `recipe` (path relative to benchmarks/multi_node/srt-slurm-recipes/, + # optionally ending in `:override[N]`) into an absolute CONFIG_FILE for the + # launcher. Copy the recipe to a scratch path first so the launcher's + # `sed -i` rewrites (job name, health-check timeout, ...) don't mutate the + # tracked file in-place between runs. 
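+          # For example, recipe "b200-fp8/8k1k_mtp_maxtpt_1.yaml" is copied into a
+          # fresh `mktemp -d` scratch dir and exported as
+          # CONFIG_FILE=<scratch>/8k1k_mtp_maxtpt_1.yaml; a suffixed value such as
+          # "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" resolves the same way,
+          # with the ":zip_override_mtp_lowlat[0]" suffix re-appended after the copy.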
+ if [[ -n "$RECIPE" ]]; then + recipe_path="${RECIPE%%:*}" + recipe_suffix="" + if [[ "$RECIPE" == *:* ]]; then + recipe_suffix=":${RECIPE#*:}" + fi + src="${GITHUB_WORKSPACE}/benchmarks/multi_node/srt-slurm-recipes/${recipe_path}" + scratch_dir="$(mktemp -d)" + scratch_recipe="${scratch_dir}/$(basename "$recipe_path")" + cp "$src" "$scratch_recipe" + export CONFIG_FILE="${scratch_recipe}${recipe_suffix}" + fi export IS_MULTINODE=true bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ "${{ inputs.eval-only }}" = "true" ]; then diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 74d4889f3..f8961f7b4 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -102,6 +102,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + recipe: ${{ matrix.config.recipe }} run-eval: false ref: ${{ inputs.ref }} @@ -141,6 +142,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + recipe: ${{ matrix.config.recipe }} run-eval: true eval-only: true eval-conc: ${{ matrix.config.eval-conc }} diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index fd1fa91be..4dea7065a 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -138,6 +138,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + recipe: ${{ matrix.config.recipe }} run-eval: false sweep-multi-node-8k1k: @@ -257,6 +258,7 @@ jobs: decode-ep: ${{ matrix.config.decode.ep }} decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + recipe: ${{ matrix.config.recipe }} run-eval: true eval-only: true eval-conc: ${{ matrix.config.eval-conc }} diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml new file mode 100644 index 000000000..b08193bcb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml @@ -0,0 +1,259 @@ +# B200-FP4 1k1k — STP and MTP in one file +# +# Two inference modes distinguished by override key names: +# zip_override_stp_* — standard token prediction (no speculative decoding) +# zip_override_mtp_* — multi-token prediction (EAGLE speculative decoding) +# +# Low-latency variants: tep8 decode (DP=1), dep4 prefill (DP=4 TP=4) +# Max-throughput variants: dep8 decode (DP=8), adds SGLANG_MOE_NVFP4_DISPATCH +# +# Note: max-tpt 1d has max-running-requests=1024; max-tpt 2d keeps 512. +# MTP max-tpt 1d additionally uses mem-fraction=0.75 for decode. 
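+#
+# Indexing note: zip_override blocks pair their list-valued fields elementwise,
+# so index [N] selects the Nth entry of every list (name, decode_nodes,
+# decode_workers, max-running-requests, concurrencies, ...). E.g.
+# zip_override_stp_maxtpt[0] is the 1p1d / 1024-request variant and [1] the
+# 1p2d / 512-request variant.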
+# +# Usage: +# srtctl apply -f recipes/b200-fp4/1k1k.yaml # all 8 variants +# srtctl apply -f recipes/b200-fp4/1k1k.yaml:*stp* # all STP variants +# srtctl apply -f recipes/b200-fp4/1k1k.yaml:*mtp* # all MTP variants +# srtctl apply -f recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0] # STP 1p5d only +# srtctl dry-run -f recipes/b200-fp4/1k1k.yaml # preview + +base: + name: "b200-fp4-stp-1k1k" + + model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + + resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 5 + decode_workers: 5 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + 
disable-radix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + req_rate: "inf" + + +# STP low-latency: tep8 decode (DP=1), scale sweep 1p5d and 1p6d +zip_override_stp_lowlat: + name: + - "b200-fp4-stp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-stp-low-latency-dep4-1p-tep8-6d" + resources: + decode_nodes: [5, 6] + decode_workers: [5, 6] + benchmark: + concurrencies: ["16x128", "32x64x256"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding + fp4-gemm-backend +zip_override_mtp_lowlat: + name: + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-6d" + resources: + decode_nodes: [5, 6] + decode_workers: [5, 6] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + fp4-gemm-backend: "flashinfer_trtllm" + decode: + fp4-gemm-backend: "flashinfer_trtllm" + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["16x512", "32x64x256x512"] + + +# STP max-throughput: dep8 decode (DP=8), scale sweep 1p1d and 1p2d +# Adds SGLANG_MOE_NVFP4_DISPATCH + SGLANG_FLASHINFER_FP4_GEMM_BACKEND env vars +# 1d: max-running-requests=1024; 2d: keeps 512 +zip_override_stp_maxtpt: + name: + - "b200-fp4-stp-max-tpt-dep4-1p-dep8-1d" + - "b200-fp4-stp-max-tpt-dep4-1p-dep8-2d" + resources: + decode_nodes: [1, 2] + decode_workers: [1, 2] + backend: + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + sglang_config: + prefill: + max-running-requests: [1024, 512] + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: [1024, 512] + cuda-graph-max-bs: [1024, 512] + benchmark: + concurrencies: ["512", "512"] + + +# MTP max-throughput: dep8 decode, scale sweep 1p1d and 1p2d, adds EAGLE speculative decoding +# Adds SGLANG_MOE_NVFP4_DISPATCH + SGLANG_FLASHINFER_FP4_GEMM_BACKEND + fp4-gemm-backend +# 1d: max-running-requests=1024, mem-fraction=0.75 for decode; 2d: keeps 512/0.85 +zip_override_mtp_maxtpt: + name: + - "b200-fp4-mtp-max-tpt-dep4-1p-dep8-1d" + - "b200-fp4-mtp-max-tpt-dep4-1p-dep8-2d" + resources: + decode_nodes: [1, 2] + decode_workers: [1, 2] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + fp4-gemm-backend: "flashinfer_trtllm" + max-running-requests: [1024, 512] + decode: + fp4-gemm-backend: "flashinfer_trtllm" + mem-fraction-static: [0.75, 0.85] + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: [1024, 512] + cuda-graph-max-bs: [1024, 512] + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["512x1024", "512"] diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml new file mode 100644 index 000000000..f5bfc9641 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml @@ -0,0 +1,351 @@ +# B200-FP4 8k1k — STP and MTP in one file +# +# Three modes distinguished by override key names: +# override_stp_tp4 / override_mtp_tp4: 
TP4 prefill (DP=1, EP=1) — low-latency single-node +# zip_override_stp_lowlat / zip_override_mtp_lowlat: dep4 prefill + tep8 decode (DP=1) +# override_stp_maxtpt_7p2d / override_mtp_maxtpt_7p2d: dep4 prefill + dep8 decode, 7p2d +# override_mtp_maxtpt_4p1d: MTP-only 4p1d, no frontends, env-var FP4 backend +# +# Usage: +# srtctl apply -f recipes/b200-fp4/8k1k.yaml # all 11 variants +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:*stp* # all STP variants +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:*mtp* # all MTP variants +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:override_stp_tp4 # STP tp4 only +# srtctl apply -f recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0] # STP 1p1d only +# srtctl dry-run -f recipes/b200-fp4/8k1k.yaml # preview + +base: + name: "b200-fp4-stp-8k1k" + + dynamo: + version: 0.8.1 + + model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + + frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + + resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + moe-dense-tp-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "modelopt_fp4" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: 
nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + fp4-gemm-backend: "flashinfer_trtllm" + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + req_rate: "inf" + + +# STP TP4 prefill mode: TP4 (DP=1, EP=1) instead of dep4 — low-latency single-node +override_stp_tp4: + name: "b200-fp4-stp-low-latency-tp4-1p-tp8-1d" + frontend: + num_additional_frontends: 2 + backend: + sglang_config: + prefill: + data-parallel-size: 1 + expert-parallel-size: 1 + enable-dp-attention: null + enable-dp-lm-head: null + decode: + expert-parallel-size: 1 + benchmark: + concurrencies: "4x8x16x64" + + +# MTP TP4 prefill mode: same as STP tp4 but adds EAGLE speculative decoding +override_mtp_tp4: + name: "b200-fp4-mtp-low-latency-tp4-1p-tp8-1d" + frontend: + num_additional_frontends: 2 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 1 + expert-parallel-size: 1 + enable-dp-attention: null + enable-dp-lm-head: null + decode: + expert-parallel-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: "4x8x16x64" + + +# STP low-latency: dep4 prefill + tep8 decode (DP=1), scale sweep 1p1d/1p5d/2p5d +zip_override_stp_lowlat: + name: + - "b200-fp4-stp-low-latency-dep4-1p-tep8-1d" + - "b200-fp4-stp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-stp-low-latency-dep4-2p-tep8-5d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 5, 5] + decode_workers: [1, 5, 5] + benchmark: + concurrencies: ["64x128", "8", "4x128"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding +zip_override_mtp_lowlat: + name: + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-1d" + - "b200-fp4-mtp-low-latency-dep4-1p-tep8-5d" + - "b200-fp4-mtp-low-latency-dep4-2p-tep8-5d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 5, 5] + decode_workers: [1, 5, 5] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + decode: + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["64x128", "8", "4x128"] + + +# STP max-throughput 7p2d: dep4 prefill + dep8 decode, flashinfer_cutlass backend +override_stp_maxtpt_7p2d: + name: "b200-fp4-stp-max-tpt-dep4-7p-dep8-2d" + resources: + prefill_nodes: 7 + prefill_workers: 7 + decode_nodes: 2 + decode_workers: 2 + backend: + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + sglang_config: + prefill: + max-prefill-tokens: 65536 + chunked-prefill-size: 65536 + max-running-requests: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 2048 + 
cuda-graph-max-bs: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + benchmark: + concurrencies: "1024x2048" + + +# MTP max-throughput 7p2d: same as STP but adds EAGLE speculative decoding +override_mtp_maxtpt_7p2d: + name: "b200-fp4-mtp-max-tpt-dep4-7p-dep8-2d" + resources: + prefill_nodes: 7 + prefill_workers: 7 + decode_nodes: 2 + decode_workers: 2 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + max-prefill-tokens: 65536 + chunked-prefill-size: 65536 + max-running-requests: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 2048 + cuda-graph-max-bs: 1024 + fp4-gemm-backend: "flashinfer_cutlass" + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: "1024x2048" + + +# MTP-only: 4p1d, no frontends, SGLANG_FLASHINFER_FP4_GEMM_BACKEND env var (fp4-gemm-backend: null +# removes the sglang_config key), mem-fraction=0.75 for decode +override_mtp_maxtpt_4p1d: + name: "b200-fp4-mtp-max-tpt-dep4-4p-dep8-1d" + dynamo: null + frontend: null + resources: + prefill_nodes: 4 + prefill_workers: 4 + decode_nodes: 1 + decode_workers: 1 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_MOE_NVFP4_DISPATCH: "1" + SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + max-running-requests: 1024 + fp4-gemm-backend: null + decode: + mem-fraction-static: 0.75 + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + fp4-gemm-backend: null + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml new file mode 100644 index 000000000..7489586aa --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml @@ -0,0 +1,281 @@ +# B200-FP8 1k1k — STP and MTP in one file +# +# Two inference modes distinguished by override key names: +# zip_override_stp_* — standard token prediction (no speculative decoding) +# zip_override_mtp_* — multi-token prediction (EAGLE speculative decoding) +# +# Low-latency variants: tep8 decode (DP=1) +# Max-throughput variants: dep8 decode (DP=8) +# +# Usage: +# srtctl apply -f recipes/b200-fp8/1k1k.yaml # all 10 variants +# srtctl apply -f recipes/b200-fp8/1k1k.yaml:*stp* # all STP variants +# srtctl apply -f recipes/b200-fp8/1k1k.yaml:*mtp* # all MTP variants +# srtctl apply -f recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0] # STP 1p1d only +# srtctl dry-run -f recipes/b200-fp8/1k1k.yaml # preview + +base: + name: "b200-fp8-stp-1k1k" + + model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + + resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + + backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + DYN_REQUEST_PLANE: nats + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + disable-cuda-graph: true + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + quantization: "fp8" + + # Disaggregation mode + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + context-length: 2200 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 8 + + # Attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # MoE + moe-runner-backend: "flashinfer_trtllm" + # moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + + health_check: + max_attempts: 360 + interval_seconds: 10 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + req_rate: "inf" + + +# STP low-latency: tep8 decode (DP=1), scale sweep 1p1d and 1p3d +zip_override_stp_lowlat: + name: + - "b200-fp8-stp-low-latency-tep8-1p-1d" + - "b200-fp8-stp-low-latency-tep8-1p-3d" + resources: + decode_nodes: [1, 3] + decode_workers: [1, 3] + benchmark: + concurrencies: ["4", "16x32x64x128x256"] + + +# MTP low-latency: same scales as STP, adds EAGLE speculative decoding +zip_override_mtp_lowlat: + name: + - "b200-fp8-mtp-low-latency-tep8-1p-1d" + - "b200-fp8-mtp-low-latency-tep8-1p-3d" + resources: + decode_nodes: [1, 3] + decode_workers: [1, 3] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + moe-dense-tp-size: 1 + decode: + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + 
speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["4x64", "4x8x16x32x128"] + + +# STP max-throughput: dep8 decode (DP=8), scale sweep 1p5d and 2p5d +zip_override_stp_maxtpt: + name: + - "b200-fp8-stp-max-tpt-dep8-1p-5d" + - "b200-fp8-stp-max-tpt-dep8-2p-5d" + resources: + prefill_nodes: [1, 2] + prefill_workers: [1, 2] + decode_nodes: [5, 5] + decode_workers: [5, 5] + backend: + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + benchmark: + concurrencies: ["1024", "2048"] + + +# MTP max-throughput: dep8 decode, scale sweep 1p1d/1p5d/2p5d, adds EAGLE speculative decoding +# Note: max-running-requests stays at 512 for MTP (unlike STP which raises to 1024) +zip_override_mtp_maxtpt: + name: + - "b200-fp8-mtp-max-tpt-dep8-1p-1d" + - "b200-fp8-mtp-max-tpt-dep8-1p-5d" + - "b200-fp8-mtp-max-tpt-dep8-2p-5d" + resources: + prefill_nodes: [1, 1, 2] + prefill_workers: [1, 1, 2] + decode_nodes: [1, 5, 5] + decode_workers: [1, 5, 5] + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + benchmark: + concurrencies: ["512x1024x2048x4096", "512x4096", "1024x2048x4096"] + + +# MTP special case: 1p2d uses speculative-num-steps=1 and draft-tokens=2 (vs 2/3 for all others) +override_mtp_maxtpt_1p2d: + name: "b200-fp8-mtp-max-tpt-dep8-1p-2d" + resources: + decode_nodes: 2 + decode_workers: 2 + backend: + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + sglang_config: + prefill: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + decode: + data-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 1 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 2 + benchmark: + concurrencies: "512x1024x2048" diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml new file mode 100644 index 000000000..3c1f465fa --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml @@ -0,0 +1,141 @@ +name: b200-fp8-mtp-low-latency-tep8-1p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '128' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml new file mode 100644 index 000000000..51671712c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml @@ -0,0 +1,141 @@ +name: b200-fp8-mtp-low-latency-tep8-1p-4d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' 
+ SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '128' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml new file mode 100644 index 000000000..27dbbe30d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml @@ -0,0 +1,141 @@ +name: b200-fp8-mtp-low-latency-tep8-1p-6d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + 
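+# 7 nodes total: one prefill node plus six decode nodes, one decode worker per node.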
+backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 22 + cuda-graph-max-bs: 22 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: 8x16x32x64x128 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml new file mode 100644 index 000000000..e5eefa2d2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml @@ -0,0 +1,144 @@ +name: b200-fp8-mtp-max-tpt-dep8-1p-1d + +dynamo: + version: 0.9.1 + 
+model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 128 + cuda-graph-max-bs: 16 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '288' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml new file mode 100644 index 000000000..fe0cd9a9f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml @@ -0,0 +1,144 @@ +name: b200-fp8-mtp-max-tpt-dep8-1p-2d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 256 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + 
speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: 160x288 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml new file mode 100644 index 000000000..7d050ff12 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml @@ -0,0 +1,144 @@ +name: b200-fp8-mtp-max-tpt-dep8-2p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 64 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 
30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '512' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml new file mode 100644 index 000000000..e687ccf84 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml @@ -0,0 +1,144 @@ +name: b200-fp8-mtp-max-tpt-dep8-3p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 3 + prefill_workers: 3 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + SGLANG_ENABLE_SPEC_V2: '1' + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 1024 + 
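# 1024 running requests over dp 8 = 128 per rank, so the graph cap below covers the full per-rank decode batch. +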
cuda-graph-max-bs: 128 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + speculative-algorithm: EAGLE + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 720 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '1024' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml new file mode 100644 index 000000000..894cef0c7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml @@ -0,0 +1,139 @@ +name: b200-fp8-stp-low-latency-tp8-1p-3d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + 
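# Decode reuses the prefill model settings; the disaggregation role, memory split, batch limits, and parallelism differ below. +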
quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '128' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml new file mode 100644 index 000000000..c05382ef8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml @@ -0,0 +1,139 @@ +name: b200-fp8-stp-low-latency-tp8-1p-4d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + 
served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '128' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml new file mode 100644 index 000000000..69e36a289 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml @@ -0,0 +1,139 @@ +name: b200-fp8-stp-low-latency-tp8-1p-6d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + 
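# Prefix (radix) caching stays off so every benchmark request is prefilled from scratch rather than served from cache. +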
disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 22 + cuda-graph-max-bs: 22 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + # disable-chunked-prefix-cache: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: 8x16x32x64x128 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml new file mode 100644 index 000000000..9846a1f05 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml @@ -0,0 +1,140 @@ +name: b200-fp8-stp-max-tpt-dep8-1p-2d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 2 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + 
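# Streaming output every 30 tokens instead of per token trims scheduler and frontend overhead during sweeps. +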
watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '288' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml new file mode 100644 index 000000000..e4eccdeab --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml @@ -0,0 +1,140 @@ +name: b200-fp8-stp-max-tpt-dep8-1p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + 
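# flashinfer_trtllm routes MoE through FlashInfer's TRT-LLM kernel path. +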
moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 256 + cuda-graph-max-bs: 256 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: 160x288 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml new file mode 100644 index 000000000..c4cc2dd33 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml @@ -0,0 +1,140 @@ +name: b200-fp8-stp-max-tpt-dep8-2p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: 
true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 512 + cuda-graph-max-bs: 512 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '512' diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml new file mode 100644 index 000000000..59cbb8197 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml @@ -0,0 +1,140 @@ +name: b200-fp8-stp-max-tpt-dep8-3p-1d + +dynamo: + version: 0.9.1 + +model: + path: dsr1-fp8 + container: dynamo-sglang + precision: fp8 + +resources: + gpu_type: b200 + prefill_nodes: 3 + prefill_workers: 3 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: '1000' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + DYN_REQUEST_PLANE: nats + CUDA_SCALE_LAUNCH_QUEUES: 4x + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: '1' + + sglang_config: + prefill: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + load-balance-method: round_robin + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 8192 + chunked-prefill-size: 65536 + max-running-requests: 8 + context-length: 9600 + + # Parallelism + tensor-parallel-size: 8 + 
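# dp == tp == 8 with dp-attention: each rank serves its own requests and KV; expert parallelism is off in prefill (ep 1). +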
data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + moe-dense-tp-size: 1 + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + + decode: + # Model configuration + served-model-name: deepseek-ai/DeepSeek-R1 + trust-remote-code: true + quantization: fp8 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + context-length: 9600 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + + # Parallelism + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + # Attention + attention-backend: trtllm_mla + kv-cache-dtype: fp8_e4m3 + + # MoE + moe-runner-backend: flashinfer_trtllm + + # Other flags + stream-interval: 30 + watchdog-timeout: 1000000 + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + req_rate: inf + concurrencies: '1024' diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml new file mode 100644 index 000000000..8729aa6fd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml @@ -0,0 +1,116 @@ +name: "gb200-fp4-1k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_node: 4 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 10 + watchdog-timeout: 
1000000 + context-length: 2200 + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + scheduler-recv-interval: 10 + enable-symm-mem: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" + tensor-parallel-size: 4 + expert-parallel-size: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml new file mode 100644 index 000000000..1075c93eb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml @@ -0,0 +1,183 @@ +name: "gb200-fp4-1k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + decode_nodes: 12 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: 
"trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + enable-single-batch-overlap: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml new file mode 100644 index 000000000..d8c80dea7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml @@ -0,0 +1,182 @@ +name: "gb200-fp4-1k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + decode_nodes: 8 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment 
variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch 
size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x2048x4096x8192" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml new file mode 100644 index 000000000..14ebda144 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml @@ -0,0 +1,118 @@ +name: "gb200-fp4-8k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_node: 4 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + + decode: + disaggregation-mode: "decode" + served-model-name: 
"deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + scheduler-recv-interval: 10 + enable-symm-mem: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_trtllm" + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8" + req_rate: 300 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml new file mode 100644 index 000000000..cf2759871 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml @@ -0,0 +1,179 @@ +name: "gb200-fp4-8k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + decode_nodes: 8 + prefill_workers: 10 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + 
mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: 700 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml new file mode 100644 index 000000000..8380eb5bf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml @@ -0,0 +1,179 @@ +name: "gb200-fp4-8k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1-fp4" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + decode_nodes: 12 + prefill_workers: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
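# Near-infinite disaggregation timeouts (editor note) keep long multi-node warmups from tripping KV-transfer failure handling. +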
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x2048x4096" + req_rate: 700 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml new file mode 100644 index 000000000..155d1664c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml @@ -0,0 +1,121 @@ +name: "gb200-fp8-1k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: 
"fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_ENABLE_FLASHINFER_GEMM: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "prefill" + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 512 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + fp8-gemm-backend: "flashinfer_trtllm" + enable-symm-mem: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "decode" + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 10 + enable-symm-mem: true + prefill-round-robin-balance: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml new file mode 100644 index 000000000..5d3c91794 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml @@ -0,0 +1,175 @@ +name: "gb200-fp8-1k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsr1-fp8" + 
container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + prefill_workers: 2 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + 
ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048x4096x6144" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml new file mode 100644 index 000000000..1f83ed1bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml @@ -0,0 +1,174 @@ +name: "gb200-fp8-1k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + decode_nodes: 12 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: 
"round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + disaggregation-transfer-backend: nixl + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048x4096" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml new file mode 100644 index 000000000..08fe2fa90 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml @@ -0,0 +1,176 @@ +name: "gb200-fp8-1k1k-ultra-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 3 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "640" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 8192 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 5120 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640] + cuda-graph-max-bs: 640 + + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml new file mode 100644 index 000000000..368b03409 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml @@ -0,0 +1,117 @@ +name: "gb200-fp8-8k1k-low-latency" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + 
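# Extra frontend replicas keep request ingest and tokenization off the
+  # critical path; the nginx container presumably load-balances across the
+  # primary and additional frontends.
+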
enable_multiple_frontends: true + num_additional_frontends: 2 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + watchdog-timeout: 1000000 + context-length: 9600 + disaggregation-mode: "prefill" + mem-fraction-static: 0.8 + max-total-tokens: 32768 + chunked-prefill-size: 24576 + cuda-graph-max-bs: 512 + max-running-requests: 512 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + disable-radix-cache: true + watchdog-timeout: 1000000 + context-length: 9600 + disaggregation-mode: "decode" + mem-fraction-static: 0.8 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 512 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + prefill-round-robin-balance: true + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + fp8-gemm-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml new file mode 100644 index 000000000..f03e34b8d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml @@ -0,0 +1,171 @@ +name: "gb200-8k1k-fp8-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + 
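# 10 frontend processes in total (1 primary + 9 additional) to feed the
+  # high-concurrency sweeps below.
+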
nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 12 + prefill_workers: 6 + decode_nodes: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.80 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 24 + dp-size: 24 + ep-size: 24 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 8192 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + 
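# SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK ("512" in decode_environment
+      # above) matches cuda-graph-max-bs below; these recipes appear to keep the
+      # per-rank dispatch budget in sync with the largest captured graph batch.
+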
ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512] + cuda-graph-max-bs: 512 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x4096x6144" + req_rate: "300" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml new file mode 100644 index 000000000..c822d67f3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml @@ -0,0 +1,170 @@ +name: "gb200-8k1k-fp8-mid-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + prefill_workers: 5 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "256" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.80 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + 
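# CUDA graphs are disabled for prefill only: prefill batches are large and
+      # variable-shaped, so graph capture buys little here, while the decode
+      # section below still captures graphs up to cuda-graph-max-bs.
+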
disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 8192 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + # CUDA graphs + cuda-graph-max-bs: 256 + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024x2048x6144" + req_rate: "300" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml new file mode 100644 index 000000000..252eafa2b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml @@ -0,0 +1,116 @@ +name: "gb300-fp4-low-latency-1k1k" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 4 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_node: 4 + +backend: + + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + + sglang_config: + prefill: + disaggregation-mode: "prefill" + served-model-name: 
"deepseek-ai/DeepSeek-R1" + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + max-running-requests: 512 + scheduler-recv-interval: 10 + enable-symm-mem: true + load-balance-method: "round_robin" + disaggregation-bootstrap-port: 30001 + data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 256 + scheduler-recv-interval: 10 + enable-symm-mem: true + tensor-parallel-size: 4 + expert-parallel-size: 1 + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml new file mode 100644 index 000000000..c941651aa --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml @@ -0,0 +1,184 @@ +name: "gb300-fp4-max-tpt-1k1k" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + decode_nodes: 12 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: 
"1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + disaggregation-transfer-backend: nixl + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + chunked-prefill-size: 786432 + + # Request handling + max-running-requests: 67584 + enable-single-batch-overlap: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + # CUDA graphs (extensive batch size list) + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x2048x4096x8192" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml new file mode 100644 index 000000000..15d3b3930 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml @@ -0,0 +1,182 @@ +name: "gb300-fp4-mid-curve-1k1k" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + 
+model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + decode_nodes: 8 + prefill_workers: 4 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutlass" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.84 + max-total-tokens: 131072 + max-prefill-tokens: 32768 + chunked-prefill-size: 65536 + enable-single-batch-overlap: true + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + + # Parallelism + tp-size: 4 + dp-size: 4 + ep-size: 4 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 2176 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 3122380 + 
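# The non-round token cap above looks like an empirically measured KV-cache
+      # capacity at this mem-fraction-static rather than a hand-picked limit
+      # (assumption; value kept as-is from upstream tuning).
+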
chunked-prefill-size: 786432
+
+      # Request handling
+      max-running-requests: 67584
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      ep-num-redundant-experts: 32
+
+      # CUDA graphs (extensive batch size list)
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024]
+      num-reserved-decode-tokens: 112
+
+      # Additional decode optimizations
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
+      disaggregation-transfer-backend: nixl
+
+      # Parallelism
+      tp-size: 32
+      dp-size: 32
+      ep-size: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x2048x4096x8192"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml
new file mode 100644
index 000000000..d3c61231b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml
@@ -0,0 +1,119 @@
+name: "gb300-fp4-8k1k-low-latency"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 3
+  nginx_container: nginx-sqsh
+
+model:
+  path: "dsr1"
+  container: "dynamo-sglang"
+  precision: "fp4"
+
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 1
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 4
+  gpus_per_node: 4
+
+backend:
+
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+
+  sglang_config:
+    prefill:
+      disaggregation-mode: "prefill"
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      disable-radix-cache: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "modelopt_fp4"
+      moe-runner-backend: "flashinfer_trtllm"
+      stream-interval: 50
+      watchdog-timeout: 1000000
+      context-length: 9600
+      mem-fraction-static: 0.95
+      max-total-tokens: 32768
+      chunked-prefill-size: 24576
+      cuda-graph-max-bs: 256
+      max-running-requests: 512
+      scheduler-recv-interval: 10
+      enable-symm-mem: true
+      load-balance-method: "round_robin"
+      disaggregation-bootstrap-port: 30001
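+      # Plain tensor parallelism below: tensor-parallel-size 4 with data and
+      # expert parallelism of 1 spans exactly one 4-GPU node per prefill worker
+      # (gpus_per_node: 4 in resources).
+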
data-parallel-size: 1 + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + + + decode: + disaggregation-mode: "decode" + served-model-name: "deepseek-ai/DeepSeek-R1" + prefill-round-robin-balance: true + trust-remote-code: true + disable-radix-cache: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + disaggregation-bootstrap-port: 30001 + stream-interval: 50 + watchdog-timeout: 1000000 + context-length: 9600 + mem-fraction-static: 0.95 + chunked-prefill-size: 8192 + cuda-graph-max-bs: 128 + scheduler-recv-interval: 10 + enable-symm-mem: true + tensor-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_trtllm" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x32x64" + req_rate: 300 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml new file mode 100644 index 000000000..001311ed7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml @@ -0,0 +1,179 @@ +name: "gb300-fp4-8k1k-max-tpt" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 10 + decode_nodes: 8 + prefill_workers: 10 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + 
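# stream-interval buffers streamed output and flushes roughly every 50 tokens,
+      # trimming per-token streaming overhead; the 1k1k low-latency recipes use 10
+      # for finer-grained streaming.
+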
decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: 700 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml new file mode 100644 index 000000000..41043ed0d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml @@ -0,0 +1,179 @@ +name: "gb300-fp4-8k1k-mid-curve" + +dynamo: + version: 0.8.1 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 + nginx_container: nginx-sqsh + +model: + path: "dsr1" + container: "dynamo-sglang" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 6 + decode_nodes: 12 + prefill_workers: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + + # Decode-specific environment variables + decode_environment: + 
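# Same base environment as prefill, plus the DeepEP per-rank dispatch budget
+    # and the NVFP4 MoE dispatch toggle at the end of the list.
+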
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1" + SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512" + SGLANG_MOE_NVFP4_DISPATCH: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_trtllm" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + disaggregation-bootstrap-port: 30001 + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.95 + max-total-tokens: 131072 + max-prefill-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + max-running-requests: 30000 + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + enable-dp-attention: false + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 4 + dp-size: 1 + ep-size: 1 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + + # KV cache and attention + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + + # Quantization + quantization: "modelopt_fp4" + moe-runner-backend: "flashinfer_cutedsl" + + # Radix cache disabled + disable-radix-cache: true + disable-chunked-prefix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + watchdog-timeout: 1000000 + context-length: 9600 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.83 + max-total-tokens: 524288 + chunked-prefill-size: 24576 + + # Request handling + max-running-requests: 16384 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + ep-num-redundant-experts: 32 + + cuda-graph-max-bs: 512 + num-reserved-decode-tokens: 112 + + # Additional decode optimizations + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + enable-dp-attention: true + fp4-gemm-backend: "flashinfer_cutlass" + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 48 + dp-size: 48 + ep-size: 48 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x2048x4096" + req_rate: 700 diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml new file mode 100644 index 000000000..51628e081 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml @@ -0,0 
+1,122 @@ +name: "gb300-1k1k-fp8-low-latency" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_node: 4 + +slurm: + time_limit: "02:00:00" + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.95 + max-total-tokens: 8192 + chunked-prefill-size: 8192 + max-prefill-tokens: 8192 + cuda-graph-max-bs: 128 + max-running-requests: 128 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 2200 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + chunked-prefill-size: -1 # save mem + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 1 # save mem + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + prefill-round-robin-balance: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4,8,16,32] + req_rate: 
"inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml new file mode 100644 index 000000000..c88a487b8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml @@ -0,0 +1,171 @@ +# GB300 FP8 Max Throughput Configuration + +name: "gb300-1k1k-fp8-max" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache 
disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] + cuda-graph-max-bs: 1024 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4096,7168,7680] + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml new file mode 100644 index 000000000..ee6690285 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml @@ -0,0 +1,170 @@ +# GB300 FP8 Mid Throughput Configuration +name: "gb300-1k1k-fp8-mid" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + prefill_workers: 2 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + 
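# benchmark traffic has no shared prefixes, so the cache would only cost KV memory (our assumption; upstream does not say why) +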
disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 2200 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 2200 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1024,2048,4096,6144] + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml new file mode 100644 index 000000000..71fd0f889 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml @@ -0,0 +1,121 @@ +name: "gb300-8k1k-fp8-low-latency" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_node: 4 + +slurm: + time_limit: "02:00:00" + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + 
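# NOTE: the *_HEARTBEAT/*_TIMEOUT values above are effectively "never", keeping the disaggregation watchdogs quiet through long startup and warmup (our reading of these knobs) +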
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + PYTHONUNBUFFERED: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + SGLANG_ENABLE_JIT_DEEPGEMM: "false" + # SGLANG_ENABLE_FLASHINFER_GEMM: "1" # deprecated in 0.5.7, --fp8-gemm-backend=flashinfer_trtllm + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + MC_TE_METRIC: "true" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 9300 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.95 + max-total-tokens: 32768 + chunked-prefill-size: 32768 + max-prefill-tokens: 32768 + cuda-graph-max-bs: 128 + max-running-requests: 128 + load-balance-method: "round_robin" + scheduler-recv-interval: 10 + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + decode: + served-model-name: "deepseek-ai/DeepSeek-R1" + trust-remote-code: true + kv-cache-dtype: "fp8_e4m3" + attention-backend: "trtllm_mla" + quantization: "fp8" + moe-runner-backend: "flashinfer_trtllm" + fp8-gemm-backend: "flashinfer_trtllm" + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + context-length: 9300 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + mem-fraction-static: 0.85 + chunked-prefill-size: -1 # save mem + cuda-graph-max-bs: 128 + max-running-requests: 128 + scheduler-recv-interval: 1 # save mem + enable-flashinfer-allreduce-fusion: false # to save mem + enable-symm-mem: false # to save mem + prefill-round-robin-balance: true + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [4,8] + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml new file mode 100644 index 000000000..6d219cc1e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml @@ -0,0 +1,171 @@ +# GB300 FP8 Max Throughput Configuration + +name: "gb300-8k1k-fp8-max" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 12 + prefill_workers: 6 + decode_nodes: 6 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + 
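# assumed to be a pre-built DeepGEMM kernel cache mounted under /configs (snapshot dated 10-21-2025), so kernels load from disk instead of JIT-compiling at startup +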
DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 24 + dp-size: 24 + ep-size: 24 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 9300 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 
72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [2048,4096] + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml new file mode 100644 index 000000000..b085f50f8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml @@ -0,0 +1,171 @@ +# GB300 FP8 Mid Throughput Configuration + +name: "gb300-8k1k-fp8-mid" + +model: + path: "dsr1-fp8" + container: "dynamo-sglang" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "gb300" + prefill_nodes: 10 + prefill_workers: 5 + decode_nodes: 8 + decode_workers: 1 + gpus_per_node: 4 + +backend: + + # Prefill-specific environment variables + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + # Decode-specific environment variables + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" + SGLANG_DG_CACHE_DIR: "/configs/dg-10212025" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768" + MC_TE_METRIC: "true" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000" + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0" + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" + PYTHONUNBUFFERED: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + max-running-requests: 30000 + context-length: 9300 + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + # Prefill-specific mode + disaggregation-mode: "prefill" + + # Memory and token limits + mem-fraction-static: 0.75 + max-total-tokens: 524288 + chunked-prefill-size: 131072 + + # Request handling + load-balance-method: "round_robin" + + # Performance optimizations + disable-cuda-graph: true + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "normal" + ep-dispatch-algorithm: "dynamic" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + ep-num-redundant-experts: 32 + deepep-config: 
"/configs/deepep_config.json" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + skip-tokenizer-init: true + trust-remote-code: true + disaggregation-transfer-backend: nixl + + # Parallelism + tp-size: 32 + dp-size: 32 + ep-size: 32 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "trtllm_mla" + kv-cache-dtype: "fp8_e4m3" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + stream-interval: 50 + decode-log-interval: 1000 + max-running-requests: 45000 + context-length: 9300 + + watchdog-timeout: 1000000 + disable-shared-experts-fusion: true + eplb-algorithm: "deepseek" + disaggregation-bootstrap-port: 30001 + + # Decode-specific mode + disaggregation-mode: "decode" + + # Memory and token limits + mem-fraction-static: 0.82 + chunked-prefill-size: 36864 + + # DeepEP configuration + moe-a2a-backend: "deepep" + deepep-mode: "low_latency" + ep-dispatch-algorithm: "static" + moe-dense-tp-size: 1 + enable-dp-lm-head: true + prefill-round-robin-balance: true + ep-num-redundant-experts: 32 + deepep-config: "/configs/deepep_config.json" + + # CUDA graphs + cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] + cuda-graph-max-bs: 768 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [128,256,512,1024] + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml new file mode 100644 index 000000000..989fc47d1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -0,0 +1,114 @@ +name: "h100-fp8-1p1d-max-dep-mtp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx-sqsh + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + 
trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml new file mode 100644 index 000000000..0ce17e8a4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml @@ -0,0 +1,116 @@ +name: "h100-fp8-1p2d-max-tp-mtp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx-sqsh + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 2 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml new file mode 100644 index 000000000..c47b6c867 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml @@ -0,0 +1,102 @@ +name: "h100-fp8-1p1d-max-dep" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml new file mode 100644 index 000000000..1f7cf9985 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml @@ -0,0 +1,102 @@ +name: "h100-fp8-1p2d-max-tp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 2 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and 
attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml new file mode 100644 index 000000000..4a0448658 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -0,0 +1,116 @@ +name: "h100-fp8-1p1d-max-dep-mtp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + 
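# same port as the prefill side above; we assume the KV-transfer handshake pairs the two workers through it +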
disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml new file mode 100644 index 000000000..591556df7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml @@ -0,0 +1,116 @@ +name: "h100-fp8-1p1d-max-tp-mtp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_SPEC_V2: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP (Multi-Token Prediction) + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml new file mode 100644 index 000000000..6c8a1c956 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml @@ -0,0 +1,102 @@ +name: 
"h100-fp8-1p1d-max-dep" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 4 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 + max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 64 + cuda-graph-max-bs: 64 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml new file mode 100644 index 000000000..196e781df --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml @@ -0,0 +1,102 @@ +name: "h100-fp8-1p1d-max-tp" + +model: + path: "dsr1-fp8" + container: "lmsysorg/sglang:v0.5.8-cu130" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 2 + decode_workers: 1 + gpus_per_node: 8 + +frontend: + nginx_container: nginx-sqsh + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + # Decode-specific environment variables + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Prefill capacity + max-running-requests: 2 + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.6 
+ max-prefill-tokens: 2048 + chunked-prefill-size: 2048 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 16 + dp-size: 1 + ep-size: 1 + enable-dp-attention: false + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 1 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml new file mode 100644 index 000000000..2c6539c93 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml @@ -0,0 +1,121 @@ +name: "bs256-1p6d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + # used to be 512 + max-running-requests: 64 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + # used to be 0.75 + mem-fraction-static: 0.82 + max-prefill-tokens: 65536 + # used to be 262144 + chunked-prefill-size: 65536 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 
128 + cuda-graph-max-bs: 128 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512x1024x2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml new file mode 100644 index 000000000..1932dc222 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml @@ -0,0 +1,109 @@ +name: "bs256-1p6d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 512 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 65536 + chunked-prefill-size: 262144 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 512 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x512x1024x2048" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml new file mode 100644 index 000000000..f2fc08020 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml @@ -0,0 +1,118 @@ +name: "bs256-1p6d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific 
environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 512 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.7 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 128 + cuda-graph-max-bs: 128 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + # concurrencies: "128x256x512" + concurrencies: "512x1024x2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml new file mode 100644 index 000000000..05afea199 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml @@ -0,0 +1,109 @@ +name: "bs256-1p6d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + 
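# flashinfer is used across the h100/h200 recipes here, while the Blackwell (gb200/gb300) recipes use trtllm_mla (observation within this directory) +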
attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 512 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.7 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 512 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + # concurrencies: "128x256x512" + concurrencies: "512x1024x2048" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml new file mode 100644 index 000000000..5d6e66ebb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml @@ -0,0 +1,116 @@ +name: "low-latency-1p9d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 9 + decode_workers: 9 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 256 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + 
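# dp/ep stay at 1 above: plain TP-8 decode favors per-token latency over batch throughput, matching this low-latency variant (our reading) +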
attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 64 + cuda-graph-max-bs: 64 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64x128x256" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml new file mode 100644 index 000000000..e60102aae --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml @@ -0,0 +1,106 @@ +name: "low-latency-1p9d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 9 + decode_workers: 9 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 256 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 256 + cuda-graph-max-bs: 256 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64x128x256" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml new file mode 100644 index 000000000..4d62e5a04 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml @@ -0,0 +1,118 @@ +name: "bs128-1p1d-dep-h200-fp8-mtp" + +model: + path: "dsr1" + 
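# short model key; presumably resolved by the launch tooling to the DeepSeek-R1 FP8 checkpoint (hypothetical mapping, not defined in this file) +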
container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.85 + max-running-requests: 192 + cuda-graph-max-bs: 192 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128x256x512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml new file mode 100644 index 000000000..d131f6b02 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml @@ -0,0 +1,109 @@ +name: "bs128-1p1d-dep-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: 
"100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-prefill-tokens: 163840 + chunked-prefill-size: 163840 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.88 + max-running-requests: 256 + cuda-graph-max-bs: 256 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64x128x256" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml new file mode 100644 index 000000000..97ea49b9a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml @@ -0,0 +1,116 @@ +name: "bs16-1p3d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: 
"deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 32 + cuda-graph-max-bs: 32 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml new file mode 100644 index 000000000..576ff2a03 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml @@ -0,0 +1,107 @@ +name: "bs16-1p3d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 32 + cuda-graph-max-bs: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x16x32" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml new file mode 100644 index 000000000..d58d55b1b --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml @@ -0,0 +1,116 @@ +name: "bs4-1p7d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 7 + decode_workers: 7 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 2 + cuda-graph-max-bs: 2 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml new file mode 100644 index 000000000..78ce3d5a1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml @@ -0,0 +1,107 @@ +name: "bs4-1p7d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 7 + decode_workers: 7 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 8 + cuda-graph-max-bs: 8 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml new file mode 100644 index 000000000..ed1232d16 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml @@ -0,0 +1,125 @@ +name: "bs64-2p3d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + 
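+    # Note: the decode block below is the other half of the disaggregated
+    # pair; it registers as disaggregation-mode "decode" and receives KV
+    # blocks from the prefill workers over the shared NIXL transfer backend.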
decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + context-length: 72000 + max-total-tokens: 128000 + # Memory and token limits + mem-fraction-static: 0.75 + max-running-requests: 16 + cuda-graph-max-bs: 16 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128" + req_rate: "inf" + +# benchmark: +# type: "gpqa" +# num_examples: 198 +# repeat: 4 +# num_threads: 32 +# max_tokens: 64000 diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml new file mode 100644 index 000000000..73aaacc30 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml @@ -0,0 +1,115 @@ +name: "bs64-2p3d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + decode_nodes: 3 + decode_workers: 3 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + #context-length: 72000 + # max-total-tokens: 128000 + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 128 + cuda-graph-max-bs: 128 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + 
concurrencies: "32x64x128" + req_rate: "inf" + +# benchmark: +# type: "gpqa" +# num_examples: 198 +# repeat: 4 +# num_threads: 32 +# max_tokens: 64000 \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml new file mode 100644 index 000000000..5bd83fa5c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml @@ -0,0 +1,117 @@ +name: "bs8-1p6d-h200-fp8-mtp" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + gpus_per_node: 8 + +backend: + + # Prefill-specific environment variables + prefill_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + # Decode-specific environment variables + decode_environment: + SGLANG_ENABLE_SPEC_V2: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + watchdog-timeout: 1000000 + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 16 + cuda-graph-max-bs: 16 + + # MTP settings + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2x4x8x16x32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml new file mode 100644 index 000000000..c37c50eea --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml @@ -0,0 +1,108 @@ +name: "bs8-1p6d-h200-fp8" + +model: + path: "dsr1" + container: "lmsysorg/sglang:v0.5.8.post1-cu130" + precision: "fp8" + +frontend: + nginx_container: nginx + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + 
decode_workers: 6 + gpus_per_node: 8 + +backend: + + prefill_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Radix cache disabled + disable-radix-cache: true + + # Other flags + # stream-interval: 50 + watchdog-timeout: 1000000 + max-running-requests: 16 + + + # Prefill-specific mode + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-prefill-tokens: 32768 + chunked-prefill-size: 32768 + + # Request handling + load-balance-method: "round_robin" + + + decode: + # Model configuration + served-model-name: "deepseek-ai/DeepSeek-R1" + model-path: "/model/" + skip-tokenizer-init: true + trust-remote-code: true + + # Parallelism + tp-size: 8 + dp-size: 1 + ep-size: 1 + + # KV cache and attention + attention-backend: "flashinfer" + + # Other flags + disable-radix-cache: true + stream-interval: 10 + watchdog-timeout: 1000000 + + # Disagg + disaggregation-bootstrap-port: 30001 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.82 + max-running-requests: 16 + cuda-graph-max-bs: 16 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16" + req_rate: "inf" + diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..ce3eff436 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,125 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# STP (no speculative decoding) +# concurrency: 666 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + 
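+      # Note: the context phase below runs without CUDA graphs
+      # (cuda_graph_config: null) and with the overlap scheduler disabled,
+      # the usual TRT-LLM setup for prefill-only workers; its KV fraction
+      # is kept low (0.4), presumably because blocks are shipped to decode
+      # as soon as prefill completes.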
tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..105b84bfd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch64_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=64 +# STP (no speculative decoding) +# concurrency: 2253 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + 
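+      # Note: with speculative decoding off (mtp0) each request emits one
+      # token per step, so max_num_tokens can match max_batch_size (64);
+      # enable_padding rounds in-flight batches up to the nearest captured
+      # CUDA-graph size.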
tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: true + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..9fb194ddc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml @@ -0,0 +1,217 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=768 +# STP (no speculative decoding) +# Covers all dep8 concurrencies: 4301, 6452 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + 
- 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301x6452" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..5639da411 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml @@ -0,0 +1,138 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=128 +# STP (no speculative decoding) +# Covers all gen4tep8 concurrencies: 4, 192, 360, 668 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + allreduce_strategy: MNNVL + 
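+      # Note: MNNVL (multi-node NVLink) keeps the TP8 allreduce on the
+      # NVLink domain even though each decode worker spans two 4-GPU
+      # GB200 nodes.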
enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x192x360x668" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..f9496feb6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml @@ -0,0 +1,122 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, max_batch=8 +# STP (no speculative decoding) +# Covers all gen5tep4 concurrencies: 5, 15, 30, 55 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: 
true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5x15x30x55" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..71b016c4b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml @@ -0,0 +1,153 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep16_batch256_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=256 +# STP (no speculative decoding) +# concurrency: 4301 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + 
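+# Note: sa-bench holds the single concurrency below (4301, matching the
+# header comment) at an unbounded ("inf") request rate.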
+benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..52b75bb4e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml @@ -0,0 +1,137 @@ +name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx2dep4_gen1dep32_batch128_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=128 +# STP (no speculative decoding) +# concurrency: 4301 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..8c1f0aa82 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP4/EP4, max_batch=32 +# Single concurrency point: 156 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 1 worker x TP4 = 4 GPUs = 1 node + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + # Decode: 4 workers x TP4 = 16 GPUs = 4 nodes + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "156" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..d4c5086b0 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml @@ -0,0 +1,123 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 4 decode workers, TP8/EP8, allreduce_strategy=MNNVL, max_batch=1 +# Single concurrency point: 4 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 1 worker x TP4 = 4 GPUs = 1 node + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + # Decode: 4 workers x TP8 = 32 GPUs = 8 nodes + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + allreduce_strategy: MNNVL + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..8f6ea063f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0" + +# ctx: 1 prefill worker, TP4/EP4 +# gen: 5 decode workers, TP4/EP4, max_batch=16 +# Covers all concurrencies: 5, 15, 30, 60, 105 + +model: + path: 
"nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 1 worker x TP4 = 4 GPUs = 1 node + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + # Decode: 5 workers x TP4 = 20 GPUs = 5 nodes + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + trust_remote_code: true + # max_batch_size=16 covers all concs: 5, 15, 30, 60, 105 + # cuda_graph pre-compiles graphs for each batch size up to the max + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5x15x30x60x105" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..4bfaa0e2c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,124 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx2dep4_gen1dep16_batch16_eplb0_mtp0" + +# ctx: 2 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=16 +# concurrency: 333 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 2 workers x TP4 = 8 GPUs = 2 nodes + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes + 
decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..d7d51627c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,126 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx3dep4_gen1dep16_batch32_eplb0_mtp0" + +# ctx: 3 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32 +# concurrency: 615 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 3 workers x TP4 = 12 GPUs = 3 nodes + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + 
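+    # Per its name, this demotes the autotuner's DEBUG output to INFO; it is
+    # set only on decode workers in these recipes, presumably because the
+    # autotuner is most verbose during CUDA-graph warmup.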
TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "615" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml new file mode 100644 index 000000000..e8df1179b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml @@ -0,0 +1,155 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0" + +# ctx: 5 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP8/EP8, enable_attention_dp=true, max_batch=256 +# Single concurrency point: 2151 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 5 workers x TP4 = 20 GPUs = 5 nodes + prefill_nodes: 5 + prefill_workers: 5 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP8 = 8 GPUs = 2 nodes + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + 
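+      # Reading of the numbers (not documented upstream): 8232 looks like
+      # ISL 8192 plus a ~40-token margin for chat-template/special tokens;
+      # the decode engine's 9256 is then 8232 + OSL 1024.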
print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + # max_batch_size=256, cuda_graph pre-compiles graphs for all batch sizes up to 256 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2151" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..db1778920 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,138 @@ +name: "kimi_k25_nvfp4_ISL8K_OSL1K_ctx7dep4_gen1dep16_batch128_eplb0_mtp0" + +# ctx: 7 prefill workers, TP4/EP4 +# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=128 +# concurrency: 2253 + +model: + path: "nvidia/Kimi-K2.5-NVFP4" + container: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2" + precision: "fp4" + +resources: + gpu_type: "gb200" + + # Prefill: 7 workers x TP4 = 28 GPUs = 7 nodes + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + # Decode: 1 worker x TP16 = 16 GPUs = 4 nodes + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + decode_environment: + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + + trtllm_config: + prefill: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + enable_attention_dp: true + disable_overlap_scheduler: true + trust_remote_code: true + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + print_iter_log: true + cuda_graph_config: null + moe_config: + backend: TRTLLM + kv_cache_config: + dtype: fp8 + 
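+        # KV cache is held in FP8 even though the weights are NVFP4;
+        # assumption: FP8 KV roughly halves the cache footprint vs. BF16 at
+        # negligible accuracy cost, and FP4 KV does not appear to be an
+        # option here.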
enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + trust_remote_code: true + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml new file mode 100644 index 000000000..d4d9de835 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml @@ -0,0 +1,119 @@ +name: "ctx1_gen2_dep8_batch64_eplb0_mtp2" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 192 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 58 + - 60 + - 62 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + 
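+        # The DEP decode recipes in this set select the CUTLASS MoE backend,
+        # while the TEP variants use TRTLLM; presumably each is the faster
+        # kernel for its parallel layout.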
use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1214" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..9532b9cc5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen5_dep8_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 13 + - 14 + - 15 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "875" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..31bf5bf20 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 
+1,112 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..3a3309f56 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,126 @@ +name: "ctx1_gen5_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + 
TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 8 + - 9 + - 10 + - 16 + - 17 + - 18 + - 29 + - 30 + - 31 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "10x15x25x45x90x180" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml new file mode 100644 index 000000000..90ad2c657 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml @@ -0,0 +1,120 @@ +name: "ctx3_gen4_dep8_batch128_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + 
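+      # "dep8" in the recipe name means 8-way expert parallelism with
+      # attention data parallelism enabled just below; the "tep8" recipes
+      # instead keep attention tensor-parallel (enable_attention_dp: false).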
enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 122 + - 124 + - 126 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4968" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml new file mode 100644 index 000000000..31adc6239 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml @@ -0,0 +1,126 @@ +name: "ctx3_gen5_dep4_batch512_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 3 + gpus_per_decode: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 192 + - 256 + - 384 + - 448 + - 506 + - 508 + - 510 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 
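+  # Assumed sa-bench semantics: req_rate "inf" means arrivals are throttled
+  # only by the concurrency cap, i.e. a closed-loop run at the listed
+  # concurrency.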
+ concurrencies: "10860" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml new file mode 100644 index 000000000..6c3e4bf80 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen1_dep8_batch512_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 384 + - 448 + - 508 + - 510 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..56746330e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "ctx1_gen2_dep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + 
OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 122 + - 124 + - 126 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2192" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..0fde29f21 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,112 @@ +name: "ctx1_gen5_dep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + 
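+      # The cache transceiver stages the prefill-to-decode KV-cache handoff
+      # over UCX; the same 8448-token buffer (8192 + 256) is reused across
+      # these recipes regardless of ISL.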
cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 26 + - 28 + - 30 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1365" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..4612b7c2c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,106 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..53e833b75 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "ctx1_gen5_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 18 + - 20 + - 22 + - 24 + - 26 + - 28 + - 30 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "10x15x25x45x90x180" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..47c2c6e22 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,116 @@ +name: "ctx1_gen6_tep8_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 6 + decode_nodes: 6 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + 
TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 56 + - 58 + - 60 + - 62 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "450" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..a1ec4f38d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,116 @@ +name: "ctx1_gen1_dep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + 
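+      # Block reuse (prefix caching) is disabled throughout these recipes,
+      # presumably so repeated benchmark prompts cannot take prefix-cache
+      # hits that would skew prefill measurements.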
cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 5 + - 6 + - 7 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "90" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..48aad03b6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen3_tep8_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 
4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "66" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..559841f73 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,112 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..f9d9843f6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx1_gen5_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + 
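+# Recipe-name legend, as used throughout this directory: ctxN = N prefill
+# (context) workers, genM = M decode (generation) workers, tep/dep = decode
+# attention layout (tensor-parallel vs. data-parallel), batchB = decode
+# max_batch_size, mtpK = K MTP draft layers (speculative_config's
+# num_nextn_predict_layers), and eplb0 = expert-parallel load balancing off
+# (the eplb reading is inferred from the field name).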
+resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "10x15x30x60" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml new file mode 100644 index 000000000..7e06d12b5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml @@ -0,0 +1,120 @@ +name: "ctx3_gen1_dep8_batch64_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + 
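+    # Disaggregated serving runs prefill (context) and decode (generation)
+    # as separate TRT-LLM engines, so batching, KV-cache fraction, and
+    # CUDA-graph settings diverge between the two sections below.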
prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 48 + - 56 + - 60 + - 62 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "548" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml new file mode 100644 index 000000000..96b4d97c5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml @@ -0,0 +1,124 @@ +name: "ctx5_gen1_dep8_batch192_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 3 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 192 + max_num_tokens: 384 + max_seq_len: 9256 + cuda_graph_config: + 
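+        # With enable_padding, an incoming batch is padded up to the nearest
+        # captured size, so the list below only needs representative sizes
+        # rather than every value up to max_batch_size.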
enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 130 + - 132 + - 134 + - 136 + - 138 + - 168 + - 192 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1096x1691" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..98229c7bf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx5_gen2_dep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 3 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 20 + - 24 + - 28 + - 30 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "658" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..762987f6e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,107 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..a03114f95 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml @@ -0,0 +1,120 @@ +name: "ctx1_gen5_tep8_batch8_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + 
UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 12 + - 13 + - 14 + - 15 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "10x15x25x50x100" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..4dfe07604 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,115 @@ +name: "ctx2_gen5_tep8_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + 
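+      # tep8: attention stays tensor-parallel (attention DP off below) while
+      # the MoE layers remain expert-parallel across the same 8 GPUs.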
moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 58 + - 60 + - 62 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "370" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml new file mode 100644 index 000000000..23c2db5d8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml @@ -0,0 +1,118 @@ +name: "ctx4_gen1_dep8_batch192_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 192 + max_num_tokens: 192 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 152 + - 160 + - 168 + - 176 + - 184 + - 190 + - 192 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1606" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..e94326803 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "ctx4_gen3_dep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 4 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 28 + - 30 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "837" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..b3c9e1300 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "ctx7_gen2_dep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b200" + prefill_nodes: 4 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" 
+ UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 116 + - 120 + - 124 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2222" + req_rate: "inf" + +frontend: + nginx_container: "nginx-sqsh" + type: "dynamo" + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml new file mode 100644 index 000000000..8c7cf706d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen2_dep8_batch768_eplb0_mtp2_1600 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + 
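+      # fp8 1k1k prefill profile: DEEPGEMM MoE, attention DP over 8 ranks,
+      # up to 16384 tokens per forward pass, 40% of free memory for KV.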
speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 768 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 768 + max_num_tokens: 2304 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1600] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml new file mode 100644 index 000000000..dd06e8462 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen3_dep8_batch384_eplb0_mtp3_1184 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 384 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 384 + max_num_tokens: 1536 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + 
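+        # MTP-3: up to 4 tokens verified per request per step, which is where
+        # max_num_tokens 1536 = 384 (batch) x 4 above comes from.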
decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1184] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml new file mode 100644 index 000000000..d41d81458 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen4_dep8_batch256_eplb0_mtp3_1024 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 1024 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1024] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml new file mode 100644 
index 000000000..3b4193e44 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen7_dep8_batch128_eplb0_mtp3_896 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 7 + decode_nodes: 7 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [896] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..de08fe729 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen8_tp8_batch1_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: 
"1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [8] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml new file mode 100644 index 000000000..0b67948c3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen8_tp8_batch32_eplb0_mtp3_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + 
disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [256] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml new file mode 100644 index 000000000..a79351e20 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen8_tp8_batch4_eplb0_mtp3_32 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + disable_overlap_scheduler: false + 
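+      # Decode keeps the overlap scheduler on; only prefill disables it.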
enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [32] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml new file mode 100644 index 000000000..1814ff355 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen8_tp8_batch8_eplb0_mtp3_64 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [64] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: 
false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml new file mode 100644 index 000000000..2e0ac949f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen1_dep8_batch512_eplb0_mtp0_4096 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 512 + max_num_tokens: 4096 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 40 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4096] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..47008c9f0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen3_tp8_batch1024_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: 
trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1024 + max_num_tokens: 4096 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml new file mode 100644 index 000000000..aa2d8c6f2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen3_tp8_batch1024_eplb0_mtp0_32 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 
1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 12 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 12 + max_num_tokens: 12 + max_seq_len: 2176 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [32] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml new file mode 100644 index 000000000..b9829e22f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen3_tp8_batch1024_eplb0_mtp0_4 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2176 + 
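+      # 2176 = 1024 ISL + 1024 OSL (benchmark below) plus 128 tokens of
+      # headroom, presumably for template/special tokens.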
moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [4] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml new file mode 100644 index 000000000..56df5bad2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen5_dep8_batch48_eplb0_mtp0_1920 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 48 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 48 + max_num_tokens: 4096 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1920] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml new file mode 100644 index 000000000..a412a6419 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml @@ -0,0 +1,115 @@ +name: ctx2_gen5_dep8_batch128_eplb0_mtp0_5152 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1152 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 1152 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 4096 + max_seq_len: 2176 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [5152] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..2ccfffba7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen2_tp8_batch32_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + 
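+  # Note: the fp8 recipes also pin TLLM_OVERRIDE_LAYER_NUM=61 on both the
+  # prefill and decode side; the fp4 recipes above leave it unset.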
decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [8] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml new file mode 100644 index 000000000..a9ad0a7d9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tp8_batch16_eplb0_mtp3_64 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [64] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml new file mode 100644 index 000000000..38b12e6c0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen6_tp8_batch8_eplb0_mtp3_48 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 6 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: 
+ dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [48] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..3b38311b7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen6_tp8_batch8_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 6 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [8] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: 
false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml new file mode 100644 index 000000000..378123831 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml @@ -0,0 +1,125 @@ +name: ctx2_gen1_dep8_batch32_eplb0_mtp3_288 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 1024 + max_seq_len: 9344 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [288] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml new file mode 100644 index 000000000..a26eaf4f1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml @@ -0,0 +1,125 @@ +name: ctx2_gen3_dep8_batch8_eplb0_mtp3_224 + +model: + path: "dsr1-fp8" + container: 
"dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9344 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [224] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml new file mode 100644 index 000000000..3c659d4dc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml @@ -0,0 +1,125 @@ +name: ctx4_gen1_dep8_batch128_eplb0_mtp2_1088 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + 
UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 3072 + max_seq_len: 9344 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [1088] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..6c383e60e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen1_dep8_batch128_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + 
enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml new file mode 100644 index 000000000..7821ab79e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen1_dep8_batch256_eplb0_mtp0_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + 
pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [256] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml new file mode 100644 index 000000000..0f2fdd949 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen1_tp8_batch1_eplb0_mtp0_1 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: true + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [1] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..305c15124 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen2_dep8_batch64_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..3c64aacf5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml @@ -0,0 +1,116 @@ +name: ctx1_gen4_tp8_batch32_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: 
+ NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml new file mode 100644 index 000000000..751bdd585 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml @@ -0,0 +1,116 @@ +name: ctx1_gen4_tp8_batch32_eplb0_mtp0_32 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + 
pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [32] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml new file mode 100644 index 000000000..cb4c4d8a3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml @@ -0,0 +1,116 @@ +name: ctx1_gen6_tp8_batch16_eplb0_mtp0_96 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 6 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [96] + req_rate: "inf" + +frontend: + type: "dynamo" + + 
enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml new file mode 100644 index 000000000..db804a6b6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml @@ -0,0 +1,115 @@ +name: ctx2_gen1_dep8_batch640_eplb0_mtp0_640 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b200" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_RNDV_SCHEME: "put_zcopy" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.2 + max_batch_size: 1 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: DEFAULT + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 640 + disable_overlap_scheduler: false + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 640 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [640] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml new file mode 100644 index 000000000..36b365a7d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml @@ -0,0 +1,127 @@ +name: "ctx1_gen1_dep8_batch64_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + 
decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "654" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..f2cd900c9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,125 @@ +name: "ctx1_gen2_dep8_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + 
TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "271" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..31bae1596 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen5_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "11" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..eeb43290a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,139 @@ +name: "ctx1_gen5_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + 
TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 8 + - 10 + - 11 + - 12 + - 16 + - 18 + - 20 + - 22 + - 23 + - 24 + - 28 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "10x20x25x60x120x200" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml new file mode 100644 index 000000000..7f8b9ae4a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml @@ -0,0 +1,129 @@ +name: "ctx2_gen1_dep8_batch256_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2342" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml new file mode 100644 index 000000000..98d8ab04d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml @@ -0,0 +1,130 @@ +name: "ctx5_gen2_dep8_batch512_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 5 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8609" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml new file mode 100644 index 000000000..a81e980ec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml @@ -0,0 +1,131 @@ +name: "ctx5_gen2_dep8_batch768_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 5 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + 
max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 1536 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "12926" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..13978a422 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen2_dep8_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + 
backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1176" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..5885277d0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + 
max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml new file mode 100644 index 000000000..9d73c7308 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen5_tep4_batch4_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 3 + gpus_per_decode: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + 
num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5x10x15x25" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..92b99de35 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml @@ -0,0 +1,136 @@ +name: "ctx1_gen5_tep8_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 8 + - 10 + - 11 + - 12 + - 16 + - 18 + - 20 + - 22 + - 27 + - 32 + - 35 + - 39 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "60x110x195x395" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff 
--git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml new file mode 100644 index 000000000..3113744c9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -0,0 +1,124 @@ +name: "ctx2_gen1_dep8_batch512_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4405" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml new file mode 100644 index 000000000..d74782639 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "ctx3_gen1_dep8_batch1024_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 832 + - 896 + - 960 + - 1024 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..5088b566c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml @@ -0,0 +1,123 @@ +name: "ctx3_gen2_dep8_batch256_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 3 + gpus_per_prefill: 2 
+ + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 8 + max_num_tokens: 10240 + max_seq_len: 1044 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4611" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml new file mode 100644 index 000000000..c24f57918 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml @@ -0,0 +1,129 @@ +name: "ctx10_gen1_dep8_batch256_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 3 + prefill_workers: 10 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + 
UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2198" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..7e2ab395a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml @@ -0,0 +1,127 @@ +name: "ctx1_gen4_tep4_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + gpus_per_node: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "52" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..83c7af6ad --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + 
OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..723029b8d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml @@ -0,0 +1,126 @@ +name: "ctx1_gen4_tep8_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + 
UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..67e9fc568 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml @@ -0,0 +1,125 @@ +name: "ctx3_gen1_dep8_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + 
trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "181" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml new file mode 100644 index 000000000..b0494f78f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml @@ -0,0 +1,128 @@ +name: "ctx9_gen1_dep8_batch128_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 3 + prefill_workers: 9 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + 
disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1197" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..5bc38c22a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen3_tep4_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 2 + gpus_per_decode: 4 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 
8448 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "105" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..002aa9e27 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen3_tep8_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "63" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..5e8d96a80 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen3_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL
+ +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml new file mode 100644 index 000000000..df7612f99 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml @@ -0,0 +1,118 @@ +name: "ctx1_gen4_tep4_batch2_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 2 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "12" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..b791d44b8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,120 @@ +name: "ctx5_gen2_dep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 5 + gpus_per_prefill: 2 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "589" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..09b89137c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,128 @@ +name: "ctx6_gen1_dep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + 
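+# The six 2-GPU prefill workers declared below pack onto the two reserved
+# 8-GPU prefill nodes (12 of 16 GPUs in use), so prefill_workers need not
+# divide evenly across prefill_nodes; this is a reading of the recipe
+# family, not a documented constraint.
+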
+resources:
+  gpu_type: "b300"
+  prefill_nodes: 2
+  prefill_workers: 6
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 1
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1"
+
+  decode_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1"
+
+
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 16896
+      max_seq_len: 8232
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8448
+        backend: UCX
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 128
+      max_num_tokens: 128
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 32
+          - 64
+          - 128
+          - 256
+          - 512
+          - 768
+          - 1024
+          - 2048
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8448
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1093"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..0ca0d7692
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,127 @@
+name: "ctx8_gen1_dep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "dynamo-trtllm"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b300"
+  prefill_nodes: 2
+  prefill_workers: 8
+  gpus_per_prefill: 2
+
+  decode_workers: 1
+  decode_nodes: 1
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1"
+
+  decode_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1"
+
+
+  trtllm_config:
+    prefill:
+      max_batch_size: 2
+      max_num_tokens: 16896
+      max_seq_len: 8232
+      tensor_parallel_size: 2
+      moe_expert_parallel_size: 2
+      enable_attention_dp: true
+      pipeline_parallel_size: 1
+      print_iter_log: true
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      moe_config:
+        backend: TRTLLM
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.75
+        dtype: fp8
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8448
+        backend: UCX
+
+    decode:
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      pipeline_parallel_size: 1
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 9256
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 32
+          - 64
+          - 128
+          - 256
+          - 512
+          - 768
+          - 1024
+          - 2048
+      print_iter_log: true
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.8
+        dtype: fp8
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cache_transceiver_config:
+        max_tokens_in_buffer: 8448
+        backend: UCX
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2048"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
new file mode 100644
index 000000000..cfa58f2a3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
@@ -0,0 +1,133 @@
+name: ctx1_gen1_dp8_batch256_eplb0_mtp1_3072
+
+model:
+  path: "dsr1-fp8"
+  container: "dynamo-trtllm"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b300"
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+
+  decode_workers: 1
+  decode_nodes: 1
+  gpus_per_decode: 8
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+    OMPI_MCA_coll_ucc_enable: "0"
+    TLLM_ALL_RANK_LOG: "1"
+    TLLM_LOG_LEVEL: "INFO"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "n"
+    UCX_MAX_RMA_RAILS: "1"
+    UCX_MAX_RNDV_RAILS: "1"
+    UCX_RNDV_SCHEME: "put_zcopy"
+    OMPI_MCA_btl: "tcp,self"
+    OMPI_MCA_pml: "ob1"
+    TRTLLM_UCX_INTERFACE:
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 2100 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [3072] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml new file mode 100644 index 000000000..866ccbb8e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml @@ -0,0 +1,133 @@ +name: ctx1_gen2_dep8_batch128_eplb0_mtp1_2560 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 1100 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [2560] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml new file mode 100644 index 000000000..4e7600a2c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml @@ -0,0 +1,133 @@ +name: ctx1_gen5_dep8_batch16_eplb0_mtp2_720 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 180 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [720] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml new file mode 100644 index 000000000..a00639e26 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen8_tp8_batch16_eplb0_mtp3_160 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 384 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [160] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml new file mode 100644 index 000000000..62ae3984f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen8_tp8_batch1_eplb0_mtp3_10 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [10] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml new file mode 100644 index 000000000..957676992 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml @@ -0,0 +1,133 @@ +name: ctx3_gen2_dp8_batch512_eplb0_mtp1_11264 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 512 + max_num_tokens: 4200 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [11264] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml new file mode 100644 index 000000000..f41079a54 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml @@ -0,0 +1,127 @@ +name: ctx1_gen1_dep8_batch256_eplb0_mtp0_2112 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 2048 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [2112] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml new file mode 100644 index 000000000..7746b638c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml @@ -0,0 +1,127 @@ +name: ctx1_gen2_dp8_batch128_eplb0_mtp0_3072 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + 
NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 1024 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [3072] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml new file mode 100644 index 000000000..bdaef8f3e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml @@ -0,0 +1,127 @@ +name: ctx1_gen3_dp8_batch48_eplb0_mtp0_1280 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 3 + decode_nodes: 3 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 48 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 48 + max_num_tokens: 384 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [1280] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml new file mode 100644 index 000000000..f469bf3bc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen8_tp8_batch64_eplb0_mtp0_12 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + 
TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [10] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml new file mode 100644 index 000000000..b3b2d8740 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen8_tp8_batch64_eplb0_mtp0_128 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: 
"mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [128] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml new file mode 100644 index 000000000..36476736b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen8_tp8_batch64_eplb0_mtp0_384 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + 
trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [384] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml new file mode 100644 index 000000000..c9d131239 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml @@ -0,0 +1,127 @@ +name: ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: false + 
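+      # Prefill disables the overlap scheduler and leaves CUDA graphs
+      # unpadded: a disaggregated prefill request completes in one forward
+      # pass, so there is no next iteration to overlap with (presumed
+      # rationale, consistent across these recipes).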
disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 1280 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 1280 + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1024 + max_num_tokens: 8192 + max_seq_len: 2400 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: [16384] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml new file mode 100644 index 000000000..7e806469c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml @@ -0,0 +1,133 @@ +name: ctx1_gen1_dp8_batch8_eplb0_mtp3_72 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + 
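+        # Block reuse is off, likely so repeated benchmark prompts do not
+        # hit the prefix cache and inflate prefill throughput (intent
+        # inferred, not stated upstream).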
free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 8 + max_num_tokens: 90 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [72] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml new file mode 100644 index 000000000..c203b724a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen2_tp8_batch16_eplb0_mtp3_40 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 
+ max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 80 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [40] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..48773bf14 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen4_tp8_batch1_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + 
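+ # max_num_tokens and max_seq_len below are 8320 = 8192-token ISL + 128 tokens of headroom on the prefill side.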
max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [5] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml new file mode 100644 index 000000000..bba0d5a65 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml @@ -0,0 +1,134 @@ +name: ctx1_gen4_tp8_batch4_eplb0_mtp3_20 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 
8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 20 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [20] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml new file mode 100644 index 000000000..9511ede04 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml @@ -0,0 +1,133 @@ +name: ctx2_gen1_dp8_batch16_eplb0_mtp3_144 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + 
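+ # eplb0 recipe: no expert load balancer, and MoE expert parallelism stays at 1.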
moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 180 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [144] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml new file mode 100644 index 000000000..7513770d8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml @@ -0,0 +1,133 @@ +name: ctx4_gen1_dp8_batch64_eplb0_mtp2_512 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 4 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + 
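+ # mtp2 variant: the speculative_config blocks below use 2 next-n predict layers, versus 3 in the neighboring mtp3 recipes.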
pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 650 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [512] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml new file mode 100644 index 000000000..2852df6c3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen4_tp8_batch16_eplb0_mtp0_64 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 
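+ # stp recipe (mtp0): no speculative_config block here or in decode, so generation emits one token per forward pass.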
+ print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 16 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [64] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml new file mode 100644 index 000000000..68ae8f4dc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml @@ -0,0 +1,128 @@ +name: ctx1_gen8_tp8_batch2_eplb0_mtp0_16 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 8 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + 
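+ # Decode-side transceiver buffer matches the 8320-token prefill ceiling, so a whole prompt's KV can move over UCX in one transfer.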
cuda_graph_config: + enable_padding: true + max_batch_size: 1 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [10] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml new file mode 100644 index 000000000..1c2977396 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml @@ -0,0 +1,127 @@ +name: ctx2_gen1_dp8_batch32_eplb0_mtp0_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 1 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + 
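+ # Per-iteration perf and request stats are disabled for the timed runs.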
kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [256] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml new file mode 100644 index 000000000..343b25905 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml @@ -0,0 +1,127 @@ +name: ctx3_gen1_dp8_batch64_eplb0_mtp0_512 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + 
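+ # max_seq_len 9344 above = 8192 ISL + 1024 OSL + 128 headroom; the MoE itself stays EP1 on this dp8 worker.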
moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [512] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml new file mode 100644 index 000000000..5aa5546ab --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml @@ -0,0 +1,128 @@ +name: ctx3_gen5_tp8_batch64_eplb0_mtp0_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 5 + decode_nodes: 5 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + disable_overlap_scheduler: false + enable_attention_dp: false + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + 
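+ # sa-bench drives an 8K-in / 1K-out workload at unbounded request rate with one fixed concurrency per run.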
isl: 8192 + osl: 1024 + concurrencies: [256] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml new file mode 100644 index 000000000..df8c2831c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml @@ -0,0 +1,127 @@ +name: ctx5_gen1_dp8_batch128_eplb0_mtp0_1075 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 3 + prefill_workers: 5 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [1075] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + 
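+ # 360 attempts x 10 s = up to one hour for all workers to report healthy before the run is abandoned.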
interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml new file mode 100644 index 000000000..9b0df56e9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml @@ -0,0 +1,127 @@ +name: ctx7_gen1_dep8_batch384_eplb0_mtp0_3072 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "b300" + prefill_nodes: 4 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 1 + gpus_per_decode: 8 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + decode_environment: + NCCL_GRAPH_MIXING_SUPPORT: "0" + OMPI_MCA_coll_ucc_enable: "0" + TLLM_ALL_RANK_LOG: "1" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + UCX_MAX_RMA_RAILS: "1" + UCX_MAX_RNDV_RAILS: "1" + UCX_RNDV_SCHEME: "put_zcopy" + OMPI_MCA_btl: "tcp,self" + OMPI_MCA_pml: "ob1" + TRTLLM_UCX_INTERFACE: "mlx5_0:1,mlx5_1:1,mlx5_10:1,mlx5_11:1,mlx5_16:1,mlx5_17:1,mlx5_20:1,mlx5_21:1,mlx5_22:1,mlx5_23:1,mlx5_4:1,mlx5_5:1,mlx5_8:1,mlx5_9:1,mlx5_2:1,mlx5_3:1" + + trtllm_config: + prefill: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: false + disable_overlap_scheduler: true + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + max_batch_size: 8 + max_num_tokens: 8320 + max_seq_len: 8320 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: AUTO + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8320 + cuda_graph_config: + enable_padding: true + max_batch_size: 384 + disable_overlap_scheduler: false + enable_attention_dp: true + enable_iter_perf_stats: false + enable_iter_req_stats: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 384 + max_num_tokens: 512 + max_seq_len: 9344 + moe_config: + backend: TRTLLM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: [3072] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false \ No newline at end of file diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..a8f90e9bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen1_dep32_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "180" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..be4f29045 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen4_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: 
"1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x12x24x48" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml new file mode 100644 index 000000000..5dd8a302b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml @@ -0,0 +1,152 @@ +name: "ctx2_gen1_dep16_batch256_eplb256_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + 
enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml new file mode 100644 index 000000000..08fc612ec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml @@ -0,0 +1,128 @@ +name: "ctx3_gen1_dep32_batch64_eplb288_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - 
cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml new file mode 100644 index 000000000..44a05c484 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml @@ -0,0 +1,213 @@ +name: "ctx3_gen5_dep4_batch768_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 1536 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + 
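+ # Detokenization fans out to 4 postprocess workers, and tokens are flushed to the frontend every 100 steps (stream_interval).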
speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16130" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..c353c3df0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml @@ -0,0 +1,113 @@ +name: "ctx1_gen1_dep32_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml new file mode 100644 index 000000000..a62b540d9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -0,0 +1,175 @@ +name: "ctx1_gen1_dep8_batch512_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 2 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + 
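+ # Same GC, PDL, and NCCL graph-mixing flags on both prefill and decode ranks, as in the other gb200 recipes.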
TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml new file mode 100644 index 000000000..d56eba13c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml @@ -0,0 +1,207 @@ +name: "ctx1_gen2_dep4_batch768_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 2 + decode_nodes: 2 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: 
true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6144" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..94a45661b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,110 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + 
decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..a93c86f82 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,125 @@ +name: "ctx1_gen4_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 8 + - 9 + - 10 + - 11 + - 12 + - 16 + - 22 + - 23 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "12x24x48x96x192" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + 
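+  # The flag below keeps etcd/NATS on a node of their own rather than
+  # co-locating the control plane with a benchmark worker.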
etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml new file mode 100644 index 000000000..9aa57eb46 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -0,0 +1,146 @@ +name: "ctx2_gen1_dep16_batch256_eplb256_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..3501708c2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml @@ -0,0 +1,119 @@ +name: "ctx2_gen1_dep32_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + 
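+  # Each recipe describes a disaggregated pair: prefill ("ctx") workers and
+  # decode ("gen") workers, each with its own environment and engine config.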
+ prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml new file mode 100644 index 000000000..0a88341a1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml @@ -0,0 +1,152 @@ +name: "ctx11_gen1_dep16_batch256_eplb256_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 11 + prefill_workers: 11 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 
16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..b4dd6005d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx1_gen4_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + 
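+        # Transceiver that hands computed KV-cache blocks from prefill to
+        # decode workers; every recipe in this set pins it to UCX.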
max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x12x24x48" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..9374538f8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "ctx3_gen1_dep32_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "180" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml new file mode 100644 index 000000000..a62e4f24f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml @@ -0,0 +1,128 @@ +name: "ctx7_gen1_dep16_batch64_eplb256_mtp1" + 
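+# The name appears to encode the topology: ctx<prefill nodes>, gen<decode
+# workers>, dep|tep = attention-DP vs. plain tensor parallel at the given
+# width, batch = decode max_batch_size, eplb = EPLB slot count (0 = off),
+# mtp = num_nextn_predict_layers. The fields below should stay in sync.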
+model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 7 + prefill_workers: 7 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..ee3082fe5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx8_gen1_dep32_batch16_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 8 + prefill_workers: 8 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + 
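+      # Prefill always runs eagerly: CUDA graphs and the overlap scheduler
+      # are off in every recipe here, presumably because a context worker
+      # processes one large batch per request instead of a steady decode loop.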
disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml new file mode 100644 index 000000000..4df408491 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -0,0 +1,146 @@ +name: "ctx10_gen1_dep16_batch256_eplb256_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + prefill_workers: 10 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + 
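+      # With enable_padding, a runtime batch is padded up to the nearest
+      # captured size above, so the ladder can use coarse 8-wide steps.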
print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..4b603ad67 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,121 @@ +name: "ctx1_gen4_tep8_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 6 + - 8 + - 9 + - 10 + - 11 + - 14 + - 15 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "12x44x76" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..1ee953844 
--- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,112 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..b08791f00 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml @@ -0,0 +1,112 @@ +name: "ctx2_gen1_dep32_batch8_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + 
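+        # Prefill uses the TRTLLM MoE backend here; the decode config below
+        # switches to WIDEEP for its 32-way expert-parallel pool.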
nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..7f4e9594e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml @@ -0,0 +1,115 @@ +name: "ctx7_gen1_dep32_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 7 + prefill_workers: 7 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + 
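+  # A single fixed client concurrency; other recipes sweep an x-separated
+  # list such as "12x44x76".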
concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..059688716 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "ctx8_gen1_dep16_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb200" + prefill_nodes: 8 + prefill_workers: 8 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml new file mode 100644 index 000000000..ba7f2ff21 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml @@ -0,0 +1,127 @@ +name: ctx1_gen1_dep16_batch64_eplb0_mtp1_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: 
trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml new file mode 100644 index 000000000..218b85744 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen1_dep32_batch16_eplb0_mtp3_615 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + 
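+        # KV cache held in FP8 (half the footprint of BF16); block reuse is
+        # disabled below so every request pays the full prefill cost.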
enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['615'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml new file mode 100644 index 000000000..fe49d8959 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml @@ -0,0 +1,151 @@ +name: ctx1_gen1_dep8_batch256_eplb0_mtp1_2151 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + enable_padding: true + 
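+      # Attention-DP decode: each of the 8 expert-parallel ranks keeps its
+      # own attention replica, with the LM head tensor-parallel inside the
+      # DP group (enable_lm_head_tp_in_adp).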
enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['2151'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml new file mode 100644 index 000000000..25847ed23 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml @@ -0,0 +1,183 @@ +name: ctx1_gen1_dep8_batch512_eplb0_mtp1_4301 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + 
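+        # These FP8 recipes pin MoE GEMMs to DEEPGEMM; the FP4 variants use
+        # TRTLLM / CUTLASS / CUTEDSL / WIDEEP instead.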
use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['4301'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml new file mode 100644 index 000000000..62d4be838 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml @@ -0,0 +1,120 @@ +name: ctx1_gen3_tep8_batch2_eplb0_mtp3_9 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['9'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml new file mode 100644 index 000000000..47f21d46b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml @@ -0,0 +1,120 @@ +name: ctx1_gen3_tep8_batch4_eplb0_mtp3_18 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['18'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml new file mode 100644 index 000000000..ecb7c92cd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml @@ -0,0 +1,121 @@ +name: ctx1_gen3_tep8_batch8_eplb0_mtp3_36 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + 
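+    # Common knobs: server/worker GC is disabled, presumably to avoid Python
+    # garbage-collection stalls mid-run, and TRTLLM_ENABLE_PDL turns on
+    # programmatic dependent launch.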
ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['36'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml new file mode 100644 index 000000000..47b869af5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml @@ -0,0 +1,129 @@ +name: ctx1_gen1_dep16_batch128_eplb0_mtp0_2151 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 
8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['2151'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml new file mode 100644 index 000000000..d1e3cae50 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen1_dep32_batch32_eplb0_mtp0_1127 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: 
['1127'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml new file mode 100644 index 000000000..c48edbd5f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml @@ -0,0 +1,114 @@ +name: ctx1_gen1_dep32_batch8_eplb0_mtp0_256 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['256'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml new file mode 100644 index 000000000..08139cf82 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml @@ -0,0 +1,177 @@ +name: ctx1_gen1_dep8_batch512_eplb0_mtp0_4301 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + 
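+  # One decode worker spanning 2 nodes x 4 GPUs/node = the 8 GPUs declared
+  # below.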
gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['4301'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml new file mode 100644 index 000000000..14b33599c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml @@ -0,0 +1,209 @@ +name: ctx1_gen1_dep8_batch768_eplb0_mtp0_6144 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + 
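+    # Two decode-only flags follow the shared ENROOT/NCCL settings below:
+    # TRTLLM_FORCE_COMM_METHOD pins communication to two-sided NVLink and
+    # ENABLE_CONFIGURABLE_MOE opts into the configurable MoE path (semantics
+    # assumed from the names; the launchers pass them through as-is).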
ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['6144'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml new file mode 100644 index 000000000..2b9250430 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml @@ -0,0 +1,114 @@ +name: ctx1_gen3_tep8_batch1_eplb0_mtp0_3 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + 
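+      # Across these gb200-fp8 recipes the prefill side is held fixed
+      # (attention-DP over TP8/EP8, DEEPGEMM MoE, CUDA graphs off); only the
+      # decode block distinguishes the dep* (attention-DP) recipes from the
+      # tep* (attention-TP) ones.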
cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['3'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml new file mode 100644 index 000000000..160f4c6ca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml @@ -0,0 +1,115 @@ +name: ctx1_gen3_tep8_batch8_eplb0_mtp0_27 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + 
use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['27'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml new file mode 100644 index 000000000..8f305ced0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml @@ -0,0 +1,120 @@ +name: ctx1_gen3_tep8_batch2_eplb0_mtp3_6 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['6'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml new file mode 100644 index 000000000..bea950ac7 --- 
/dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml @@ -0,0 +1,120 @@ +name: ctx1_gen3_tep8_batch4_eplb0_mtp3_15 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['15'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml new file mode 100644 index 000000000..fbf861990 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml @@ -0,0 +1,119 @@ +name: ctx2_gen1_dep32_batch2_eplb0_mtp3_90 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + 
TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 2 + max_num_tokens: 8 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['90'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml new file mode 100644 index 000000000..ea8a7d013 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml @@ -0,0 +1,121 @@ +name: ctx3_gen1_dep16_batch16_eplb0_mtp3_333 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + 
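+      # With MTP3 each scheduled request can contribute up to 4 tokens per
+      # step (1 target + 3 draft), hence max_num_tokens = 4 x max_batch_size
+      # (64 = 4 x 16 below); the same ratio holds across the mtp3 recipes.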
cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['333'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml new file mode 100644 index 000000000..2ad2e727d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml @@ -0,0 +1,127 @@ +name: ctx3_gen1_dep8_batch64_eplb0_mtp3_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + 
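+  # 8k1k workload: 8192 input / 1024 output tokens per request; the trailing
+  # number in the recipe name ('666' here) is the benchmark concurrency.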
osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml new file mode 100644 index 000000000..95bf6192f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml @@ -0,0 +1,120 @@ +name: ctx4_gen1_dep32_batch8_eplb0_mtp3_333 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 8 + prefill_workers: 4 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['333'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml new file mode 100644 index 000000000..35da2b70f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -0,0 +1,123 @@ +name: ctx5_gen1_dep16_batch32_eplb0_mtp3_666 + +model: + path: "dsr1-fp8" + container: 
"dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + prefill_workers: 5 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml new file mode 100644 index 000000000..178a3b7df --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml @@ -0,0 +1,116 @@ +name: ctx1_gen3_tep8_batch16_eplb0_mtp0_63 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + 
cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['63'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml new file mode 100644 index 000000000..f33813fd9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml @@ -0,0 +1,114 @@ +name: ctx1_gen3_tep8_batch1_eplb0_mtp0_6 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + 
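+        # Set on every decode config in this tree; presumably trades some
+        # precision in the MoE combine step for lower interconnect traffic
+        # (assumption from the flag name, not a documented guarantee).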
use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['6'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml new file mode 100644 index 000000000..98aee313b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml @@ -0,0 +1,114 @@ +name: ctx1_gen3_tep8_batch4_eplb0_mtp0_18 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 2 + prefill_workers: 1 + gpus_per_prefill: 8 + + decode_workers: 3 + decode_nodes: 6 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['18'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml new file mode 100644 index 000000000..816065639 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml @@ -0,0 +1,114 @@ 
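+# Naming convention, inferred from this directory rather than documented:
+# ctx{P}/gen{G} = prefill/decode worker counts, dep{N} vs tep{N} =
+# attention-DP vs attention-TP decode at EP width N, batch{B} = decode
+# max_batch_size, eplb{E} = EPLB setting, mtp{M} = MTP draft layers, and the
+# trailing number is the benchmark concurrency.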
+name: ctx2_gen1_dep32_batch8_eplb0_mtp0_333 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 4 + prefill_workers: 2 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['333'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml new file mode 100644 index 000000000..f7d87c1b3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml @@ -0,0 +1,117 @@ +name: ctx3_gen1_dep16_batch32_eplb0_mtp0_615 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 6 + prefill_workers: 3 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + 
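+      # CUDA graphs stay off for prefill and the overlap scheduler is
+      # disabled -- our reading of the upstream choice for variable-shape
+      # prefill batches, not a documented requirement.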
cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['615'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml new file mode 100644 index 000000000..27a19e5b8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml @@ -0,0 +1,115 @@ +name: ctx4_gen1_dep32_batch16_eplb0_mtp0_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 8 + prefill_workers: 4 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + 
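+      # Output-path tuning: 4 postprocess workers and responses streamed
+      # every 100 tokens (stream_interval) -- assumed semantics of these two
+      # knobs.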
num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml new file mode 100644 index 000000000..634f07cdb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml @@ -0,0 +1,121 @@ +name: ctx5_gen1_dep16_batch64_eplb0_mtp0_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb200" + prefill_nodes: 10 + prefill_workers: 5 + gpus_per_prefill: 8 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 8 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + nginx_container: "nginx-sqsh" + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml new file mode 100644 index 000000000..b4434cdda --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml @@ -0,0 +1,121 @@ +name: 
"ctx1_gen1_dep32_batch8_eplb0_mtp" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "333" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml new file mode 100644 index 000000000..e264a1796 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml @@ -0,0 +1,216 @@ +name: "ctx1_gen1_dep4_batch768_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + 
enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 1536 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "3226" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..67c672ffb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + 
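+# Prefill runs eagerly: cuda_graph_config is null and the overlap
+# scheduler is disabled just below. A likely rationale (a reading, not
+# stated in the recipe) is that long prefill steps gain little from CUDA
+# graphs, which matter most for small, repetitive decode batches.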
print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..aab184727 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,124 @@ +name: "ctx1_gen4_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 6 + - 8 + print_iter_log: true + kv_cache_config: + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8x12x24x48" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml new file mode 100644 index 000000000..58cbacdf4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml @@ -0,0 +1,139 @@ +name: "ctx3_gen1_dep16_batch128_eplb256_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: 
false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml new file mode 100644 index 000000000..698989630 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml @@ -0,0 +1,127 @@ +name: "ctx3_gen1_dep32_batch32_eplb288_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..642aa6c43 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,113 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + 
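+# Resource math, as these fields appear to compose: 4 tep8 decode workers
+# x 8 GPUs each = 32 GPUs = decode_nodes (8) x gpus_per_node (4).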
decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..44774b6bc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: "ctx1_gen4_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + 
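+# The UCX cache transceiver ships prefilled KV blocks to the decode
+# workers; its 16384-token buffer matches prefill max_num_tokens above
+# (a presumed sizing choice, not documented in the recipe).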
cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + - 6 + - 8 + - 10 + - 11 + - 12 + - 16 + - 18 + - 20 + - 24 + - 28 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "12x48x96x192" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..ffc2850fb --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx2_gen1_dep32_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + 
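+# isl/osl are the benchmark input/output sequence lengths; 1024/1024 is
+# the "1k1k" workload this directory is named for.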
concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml new file mode 100644 index 000000000..28e148d02 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml @@ -0,0 +1,241 @@ +name: "ctx2_gen1_dep8_batch1024_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 2 + + decode_workers: 1 + decode_nodes: 2 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + - 776 + - 784 + - 792 + - 800 + - 808 + - 816 + - 824 + - 832 + - 840 + - 848 + - 856 + - 864 + - 872 + - 880 + - 888 + - 896 + - 904 + - 912 + - 920 + - 928 + - 936 + - 944 + - 952 + - 960 + - 968 + - 976 + - 984 + - 992 + - 1000 + - 1008 + - 1016 + - 1024 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: 
"sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml new file mode 100644 index 000000000..4d4ffe594 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -0,0 +1,149 @@ +name: "ctx3_gen1_dep16_batch256_eplb256_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4301" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..de841c92c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml @@ -0,0 +1,122 @@ +name: 
"ctx3_gen1_dep32_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 3 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 2048 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..7bf2a9332 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,123 @@ +name: "ctx10_gen1_dep16_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 5 + prefill_workers: 10 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + 
disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml new file mode 100644 index 000000000..09710a97d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml @@ -0,0 +1,151 @@ +name: "ctx10_gen1_dep8_batch256_eplb0_mtp1" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 5 + prefill_workers: 10 + + decode_workers: 1 + decode_nodes: 2 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 512 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 
+ - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml new file mode 100644 index 000000000..61988358c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml @@ -0,0 +1,131 @@ +name: "ctx13_gen1_dep16_batch64_eplb256_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 13 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1127" + 
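+# req_rate "inf" presumably means requests are injected as fast as the
+# target concurrency permits, so load is set entirely by `concurrencies`.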
req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..f07f607ea --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml @@ -0,0 +1,122 @@ +name: "ctx1_gen3_tep8_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 6 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "33" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..be9842323 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 
4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..5d45c06d3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml @@ -0,0 +1,122 @@ +name: "ctx1_gen4_tep8_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - 
cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 3 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "12x24" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..c0c4f66e7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -0,0 +1,119 @@ +name: "ctx4_gen1_dep32_batch4_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 4 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt 
+ - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "180" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..e719310a4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml @@ -0,0 +1,120 @@ +name: "ctx8_gen1_dep32_batch8_eplb0_mtp3" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + prefill_workers: 8 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "308" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..6b6f4a36e --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml @@ -0,0 +1,146 @@ +name: "ctx11_gen3_dep4_batch256_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 6 + prefill_workers: 11 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3228" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..42523722e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml @@ -0,0 +1,129 @@ +name: "ctx14_gen1_dep16_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 14 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
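+# The two DISABLE_GC flags presumably turn off Python garbage collection
+# in the server and worker processes so GC pauses do not show up as
+# latency spikes mid-run (a naming-based reading, not documented here).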
NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2253" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..34678b650 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,117 @@ +name: "ctx1_gen3_tep8_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 3 + decode_nodes: 6 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 
9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "72" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..158dd4ed9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,113 @@ +name: "ctx1_gen4_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml new file mode 100644 index 000000000..f2f18332c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "ctx1_gen4_tep8_batch2_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 4 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 2 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "12" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml new file mode 100644 index 000000000..f380710f8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "ctx1_gen5_tep4_batch4_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + 
+ trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "5x15x30" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..8dbb94ea5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml @@ -0,0 +1,116 @@ +name: "ctx7_gen1_dep32_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + prefill_workers: 7 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + 
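# enable_block_reuse: false disables KV prefix-cache reuse across requests +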
free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "666" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..eba48a69c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml @@ -0,0 +1,122 @@ +name: "ctx9_gen1_dep16_batch64_eplb0_mtp0" + +model: + path: "dsr1" + container: "dynamo-trtllm" + precision: "fp4" + +resources: + gpu_type: "gb300" + prefill_nodes: 5 + prefill_workers: 9 + gpus_per_prefill: 2 + + decode_workers: 1 + decode_nodes: 4 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_ENABLE_PDL: "1" + + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: TRTLLM + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1229" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false + +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml new file mode 100644 index 000000000..fd4c842d5 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -0,0 +1,126 @@ +name: ctx1_gen1_dep16_batch32_eplb0_mtp3_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml new file mode 100644 index 000000000..24cc7fcb2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml @@ -0,0 +1,122 @@ +name: ctx1_gen1_dep32_batch4_eplb0_mtp3_180 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: 
"0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['180'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..dd886c1c6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tep8_batch1_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + 
pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['8'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml new file mode 100644 index 000000000..6625fde5d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tep8_batch4_eplb0_mtp3_24 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + 
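# max_num_tokens 16 = max_batch_size 4 x (1 target + 3 MTP draft tokens) per decode step +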
pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['24'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml new file mode 100644 index 000000000..14b8c83ec --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml @@ -0,0 +1,138 @@ +name: ctx2_gen1_dep16_batch128_eplb0_mtp1_2253 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['2253'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml new file mode 100644 index 000000000..30335f8e4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml @@ -0,0 +1,124 @@ +name: ctx2_gen1_dep32_batch16_eplb0_mtp3_564 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['564'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml new file mode 100644 index 000000000..5985d197c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml @@ -0,0 +1,186 @@ +name: ctx3_gen2_dep8_batch512_eplb0_mtp1_8192 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 4 + 
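# 2 decode workers x 8 GPUs each = 4 nodes x 4 GPUs +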
gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 512 + max_num_tokens: 1024 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['8192'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml new file mode 100644 index 000000000..5d74bf4f0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml @@ -0,0 +1,119 @@ +name: ctx1_gen4_tep8_batch16_eplb0_mtp0_84 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
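# Python GC is disabled in both server and worker processes, presumably to keep GC pauses out of latency numbers +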
TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['84'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml new file mode 100644 index 000000000..9b51b74ce --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen4_tep8_batch1_eplb0_mtp0_4 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + 
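# stp variant: no speculative_config in prefill or decode (cf. the mtp recipes) +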
print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['4'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml new file mode 100644 index 000000000..bc0a9ad4a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen4_tep8_batch4_eplb0_mtp0_24 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 2088 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['24'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: 
false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml new file mode 100644 index 000000000..126e651e1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml @@ -0,0 +1,132 @@ +name: ctx2_gen1_dep16_batch128_eplb0_mtp0_2253 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['2253'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml new file mode 100644 index 000000000..f66062760 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml @@ -0,0 +1,120 @@ +name: ctx2_gen1_dep32_batch32_eplb0_mtp0_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" 
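+ # one DEP32 decode worker spans 8 nodes x 4 GPUs; each prefill worker gets a full node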
+ prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml new file mode 100644 index 000000000..68a326b76 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml @@ -0,0 +1,180 @@ +name: ctx3_gen2_dep8_batch512_eplb0_mtp0_8602 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + 
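# prefill skips CUDA graphs and disables the overlap scheduler (settings that follow) +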
cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['8602'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml new file mode 100644 index 000000000..8cd72351d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml @@ -0,0 +1,212 @@ +name: ctx3_gen2_dep8_batch768_eplb0_mtp0_12288 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 3 + prefill_workers: 3 + gpus_per_prefill: 4 + + decode_workers: 2 + decode_nodes: 4 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 16 + max_num_tokens: 16384 + max_seq_len: 1064 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + 
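# prefill max_num_tokens 16384 admits up to sixteen 1k-token prompts per engine step +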
print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + - 264 + - 272 + - 280 + - 288 + - 296 + - 304 + - 312 + - 320 + - 328 + - 336 + - 344 + - 352 + - 360 + - 368 + - 376 + - 384 + - 392 + - 400 + - 408 + - 416 + - 424 + - 432 + - 440 + - 448 + - 456 + - 464 + - 472 + - 480 + - 488 + - 496 + - 504 + - 512 + - 520 + - 528 + - 536 + - 544 + - 552 + - 560 + - 568 + - 576 + - 584 + - 592 + - 600 + - 608 + - 616 + - 624 + - 632 + - 640 + - 648 + - 656 + - 664 + - 672 + - 680 + - 688 + - 696 + - 704 + - 712 + - 720 + - 728 + - 736 + - 744 + - 752 + - 760 + - 768 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2088 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: ['12288'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml new file mode 100644 index 000000000..6123b194f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml @@ -0,0 +1,130 @@ +name: ctx10_gen1_dep16_batch64_eplb0_mtp1_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 10 + prefill_workers: 10 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + 
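# MTP with a single next-token prediction layer (one draft token per step) +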
num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 64 + max_num_tokens: 128 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml new file mode 100644 index 000000000..3c61eca96 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tep8_batch1_eplb0_mtp3_8 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + 
decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['8'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml new file mode 100644 index 000000000..539a3f780 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml @@ -0,0 +1,123 @@ +name: ctx1_gen4_tep8_batch4_eplb0_mtp3_24 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 4 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['24'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml new file mode 100644 index 000000000..49e94caa5 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml @@ -0,0 +1,123 @@ +name: ctx6_gen1_dep32_batch8_eplb0_mtp3_333 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 6 + prefill_workers: 6 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 8 + max_num_tokens: 32 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['333'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml new file mode 100644 index 000000000..e531467ca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml @@ -0,0 +1,138 @@ +name: ctx7_gen1_dep8_batch128_eplb0_mtp1_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + 
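+  # (editorial note): in every GB300 recipe here, decode_environment extends
+  # the prefill set with TRTLLM_FORCE_COMM_METHOD=NVLINK_TWO_SIDED and
+  # ENABLE_CONFIGURABLE_MOE=1; the NVLink comm override is decode-only.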
decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 128 + max_num_tokens: 256 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml new file mode 100644 index 000000000..fadb3c8c1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -0,0 +1,126 @@ +name: ctx8_gen1_dep16_batch32_eplb0_mtp3_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 8 + prefill_workers: 8 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + 
max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml new file mode 100644 index 000000000..30ba58dcd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen4_tep8_batch1_eplb0_mtp0_4 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + 
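+      # (editorial observation): the low-latency "tep" variants like this one
+      # pair the TRTLLM MoE backend with enable_attention_dp: false and
+      # allreduce_strategy: MNNVL, while the throughput "dep" variants use
+      # DEEPGEMM with attention DP enabled.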
moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['4'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml new file mode 100644 index 000000000..091164082 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml @@ -0,0 +1,117 @@ +name: ctx1_gen4_tep8_batch4_eplb0_mtp0_24 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 4 + max_num_tokens: 4 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['24'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml new file mode 100644 index 000000000..de8d408d1 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml @@ -0,0 +1,118 @@ +name: ctx1_gen4_tep8_batch8_eplb0_mtp0_36 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + + decode_workers: 4 + decode_nodes: 8 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + allreduce_strategy: MNNVL + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + enable_padding: true + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + max_batch_size: 8 + max_num_tokens: 8 + max_seq_len: 9256 + moe_config: + backend: TRTLLM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['36'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml new file mode 100644 index 000000000..70aade3de --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml @@ -0,0 +1,120 @@ +name: ctx4_gen1_dep16_batch32_eplb0_mtp0_666 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + 
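+    # (editorial gloss, assumptions): the two GC flags above disable Python
+    # garbage collection in the server and worker processes to avoid
+    # steady-state pauses; TRTLLM_ENABLE_PDL below turns on programmatic
+    # dependent launch.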
TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['666'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml new file mode 100644 index 000000000..cfe8dead6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml @@ -0,0 +1,118 @@ +name: ctx6_gen1_dep32_batch16_eplb0_mtp0_512 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 6 + prefill_workers: 6 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 8 + gpus_per_decode: 32 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + enable_padding: true + 
enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 32 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 32 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['512'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml new file mode 100644 index 000000000..97745e8c8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml @@ -0,0 +1,124 @@ +name: ctx7_gen1_dep16_batch64_eplb0_mtp0_1229 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 4 + gpus_per_decode: 16 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 16 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 16 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['1229'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml new file mode 100644 index 000000000..09e23abed --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml @@ -0,0 +1,148 @@ +name: ctx7_gen1_dep8_batch256_eplb0_mtp0_2151 + +model: + path: "dsr1-fp8" + container: "dynamo-trtllm" + precision: "fp8" + +resources: + gpu_type: "gb300" + prefill_nodes: 7 + prefill_workers: 7 + gpus_per_prefill: 4 + + decode_workers: 1 + decode_nodes: 2 + gpus_per_decode: 8 + + gpus_per_node: 4 + +backend: + type: trtllm + + prefill_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + TLLM_OVERRIDE_LAYER_NUM: "61" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + TRTLLM_ENABLE_PDL: "1" + ENROOT_ALLOW_DEV: "yes" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TRTLLM_FORCE_COMM_METHOD: "NVLINK_TWO_SIDED" + ENABLE_CONFIGURABLE_MOE: "1" + + trtllm_config: + prefill: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: null + disable_overlap_scheduler: true + enable_attention_dp: true + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.1 + max_batch_size: 2 + max_num_tokens: 16384 + max_seq_len: 8232 + moe_config: + backend: DEEPGEMM + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + print_iter_log: true + tensor_parallel_size: 4 + + + decode: + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + cuda_graph_config: + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + enable_padding: true + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + moe_config: + backend: DEEPGEMM + use_low_precision_moe_combine: true + moe_expert_parallel_size: 8 + num_postprocess_workers: 4 + pipeline_parallel_size: 1 + print_iter_log: true + stream_interval: 100 + tensor_parallel_size: 8 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: ['2151'] + req_rate: "inf" + +frontend: + type: "dynamo" + + enable_multiple_frontends: false + + +health_check: + max_attempts: 360 + interval_seconds: 10 + +dynamo: + install: false +infra: + etcd_nats_dedicated_node: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml new file mode 100644 index 000000000..104f3b4ab --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml @@ -0,0 +1,105 @@ +name: h100_1k1k_ctx1dep16_gen1dep16_batch32_eplb0_mtp2_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: 
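+  # (editorial note): the H100 recipes assume 8-GPU nodes with each worker
+  # spanning two nodes, so prefill_nodes and decode_nodes are twice the worker
+  # counts and every engine below runs TP16/EP16.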
+ gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '615' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml new file mode 100644 index 000000000..4c41ec82a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml @@ -0,0 +1,109 @@ +name: h100_1k1k_ctx1dep16_gen1dep16_batch64_eplb0_mtp1_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + 
TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '1229' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..c3dc14082 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml @@ -0,0 +1,101 @@ +name: h100_1k1k_ctx1dep16_gen3dep16_batch4_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 
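+        # the cache transceiver streams prefill KV blocks to decode workers;
+        # max_tokens_in_buffer bounds the staging buffer and backend selects
+        # the transport (editorial gloss).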
+ backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '231' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml new file mode 100644 index 000000000..8f3663c94 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml @@ -0,0 +1,114 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch128_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + 
decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '462' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml new file mode 100644 index 000000000..bd77671ac --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml @@ -0,0 +1,100 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch16_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '60' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..c1fccbc9d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -0,0 +1,98 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + 
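+  # (editorial note, assumption): the registry#path:tag form is the
+  # enroot/pyxis image reference syntax consumed by srun --container-image.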
precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml new file mode 100644 index 000000000..15c71e8d3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -0,0 +1,98 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: 
true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..4f261058e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,102 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch32_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 
0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '117' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..07de7a34d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_1k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3_chunked_false +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..4a55e5ed8 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,97 @@ +name: ctx1dep16_gen3dep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '924' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..2bedf4c23 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml @@ -0,0 +1,99 @@ +name: ctx1dep16_gen3dep16_batch32_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + 
TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '1845' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml new file mode 100644 index 000000000..1ff9ace49 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml @@ -0,0 +1,95 @@ +name: ctx1dep16_gen3dep16_batch4_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + 
moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '231' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..215e8a6bf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,96 @@ +name: ctx1dep16_gen3dep16_batch8_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '462' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..4281abed2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,94 @@ +name: ctx1dep16_gen3tep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '60' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..a0e0005e8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -0,0 +1,92 @@ +name: ctx1dep16_gen3tep16_batch1_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + 
TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml new file mode 100644 index 000000000..6eee90d2d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -0,0 +1,92 @@ +name: ctx1dep16_gen3tep16_batch2_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: 
+ enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..29e634316 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,93 @@ +name: ctx1dep16_gen3tep16_batch8_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..bb02cdd0a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml @@ -0,0 +1,127 @@ +name: ctx2dep16_gen1dep16_batch256_eplb0_mtp0 
+model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + decode_environment: + UCX_TLS: rc,dc,ud,cuda_copy,cuda_ipc,tcp + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 2 + max_num_tokens: 2048 + max_seq_len: 2048 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 2088 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + - 136 + - 144 + - 152 + - 160 + - 168 + - 176 + - 184 + - 192 + - 200 + - 208 + - 216 + - 224 + - 232 + - 240 + - 248 + - 256 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8192 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 1024 + osl: 1024 + concurrencies: '4916' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml new file mode 100644 index 000000000..b78cb01af --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml @@ -0,0 +1,101 @@ +name: h100_8k1k_ctx1dep16_gen1dep16_batch4_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + 
UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '77' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..dd0ddda85 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml @@ -0,0 +1,103 @@ +name: h100_8k1k_ctx1dep16_gen2tep16_batch32_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 2 + decode_nodes: 4 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + 
decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '78' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..2f0ef4e90 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '6' + 
req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml new file mode 100644 index 000000000..be3fc74ce --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml @@ -0,0 +1,99 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '9' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..6a710bbb5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,100 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: 
trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + UCX_CUDA_IPC_ENABLE_MNNVL: n + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '30' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml new file mode 100644 index 000000000..4d746af13 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml @@ -0,0 +1,102 @@ +name: h100_8k1k_ctx2dep16_gen1dep16_batch8_eplb0_mtp3 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: 
true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '154' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml new file mode 100644 index 000000000..2f630277e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen2tep16_batch64_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 2 + decode_nodes: 4 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + 
use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "154" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # Multiple frontends collide on port 8080 (among other errors), so keep a single frontend. + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..9081201ba --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml @@ -0,0 +1,94 @@ +name: h100_8k1k_ctx1dep16_gen3tep16_batch1_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '6' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml new file mode 100644 index 000000000..938fd965c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen3tep16_batch2_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" +
container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "9" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # Multiple frontends collide on port 8080 (among other errors), so keep a single frontend.
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml new file mode 100644 index 000000000..c1eb86c19 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml @@ -0,0 +1,104 @@ + + +name: "h100_8k1k_ctx1dep16_gen3tep16_batch8_eplb0_mtp0" + +model: + path: "DeepSeek-R1-0528" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: "fp8" + +resources: + gpu_type: "h100" + prefill_workers: 1 + prefill_nodes: 2 + decode_workers: 3 + decode_nodes: 6 + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: "1" + TRTLLM_FORCE_ALLTOALL_METHOD: "DeepEP" + + decode_environment: + NCCL_NVLS_ENABLE: "0" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + TLLM_LOG_LEVEL: "INFO" + UCX_CUDA_IPC_ENABLE_MNNVL: "n" + + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: true + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + + + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 256 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8] + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "30" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # Multiple frontends collide on port 8080 (among other errors), so keep a single frontend.
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..40c84770f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml @@ -0,0 +1,97 @@ +name: h100_8k1k_ctx2dep16_gen1dep16_batch16_eplb0_mtp0 +model: + path: DeepSeek-R1-0528 + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3" + precision: fp8 +resources: + gpu_type: h100 + prefill_workers: 2 + prefill_nodes: 4 + decode_workers: 1 + decode_nodes: 2 + gpus_per_node: 8 +backend: + type: trtllm + prefill_environment: + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + decode_environment: + NCCL_NVLS_ENABLE: '0' + UCX_CUDA_IPC_ENABLE_MNNVL: n + TRTLLM_ENABLE_PDL: '1' + TRTLLM_SERVER_DISABLE_GC: '1' + TRTLLM_WORKER_DISABLE_GC: '1' + NCCL_GRAPH_MIXING_SUPPORT: '0' + TLLM_LOG_LEVEL: INFO + TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP: '1' + TRTLLM_FORCE_ALLTOALL_METHOD: DeepEP + trtllm_config: + prefill: + max_batch_size: 1 + max_num_tokens: 8224 + max_seq_len: 8232 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + enable_chunked_prefill: false + moe_config: + backend: WIDEEP + max_num_tokens: 16384 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8256 + decode: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: false + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 128 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8256 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 +benchmark: + type: sa-bench + isl: 8192 + osl: 1024 + concurrencies: '308' + req_rate: inf +frontend: + type: dynamo + enable_multiple_frontends: false +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml new file mode 100644 index 000000000..7c3fc7c0e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml @@ -0,0 +1,107 @@ +name: "c128_ctx1_gen7_dep8_batch128_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + decode_workers: 7 + decode_nodes: 7 + gpus_per_node: 8 + +backend: + type: trtllm + prefill_environment: + UCX_TLS: 
"rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + trtllm_config: + prefill: + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 8 + max_num_tokens: 8192 + max_seq_len: 1064 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + enable_chunked_prefill: false + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2088 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 8192 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128] + disable_overlap_scheduler: false + print_iter_log: true + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "896" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false + +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml new file mode 100644 index 000000000..4feb8690d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml @@ -0,0 +1,137 @@ +name: "c16_ctx1_gen9_tep8_batch128_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 9 + decode_nodes: 9 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 1k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true +
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=16)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "144"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
new file mode 100644
index 000000000..522618223
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
@@ -0,0 +1,117 @@
+name: "c1_ctx1_gen11_tep8_batch1_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 11
+  decode_nodes: 11
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode, aggressive ctx:gen 1:11 for c=1)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=1, TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 1
+      max_num_tokens: 4
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "13"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..5be701be2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,107 @@
+name: "c256_ctx1_gen4_dep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_workers: 4
+  decode_nodes: 4
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  trtllm_config:
+    prefill:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1024"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..6e8464280
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,107 @@
+name: "c32_ctx1_gen11_tep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_workers: 11
+  decode_nodes: 11
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  trtllm_config:
+    prefill:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "352"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..69f96bac7
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,135 @@
+name: "c4_ctx1_gen11_tep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 11
+  decode_nodes: 11
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode, aggressive ctx:gen 1:11 for c=4)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=4, TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "44"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
new file mode 100644
index 000000000..a7275865f
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
@@ -0,0 +1,153 @@
+name: "c512_ctx1_gen2_dep8_batch256_eplb0_mtp1"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 2
+  decode_nodes: 2
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=512)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1024"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
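+# Sizing note: decode max_num_tokens (512) appears to be max_batch_size (256)
+# times the two tokens scored per request per step under MTP-1 (one draft plus
+# one target token); this is inferred from the numbers, not an upstream comment.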
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..b68aae478
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,137 @@
+name: "c64_ctx1_gen8_dep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 8
+  decode_nodes: 8
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=64)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: true
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
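+# Naming note: the cN prefix in these recipe names appears to be the target
+# concurrency per decode worker; benchmark.concurrencies is then cN x
+# decode_workers (here 64 x 8 = 512).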
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
new file mode 100644
index 000000000..506a8c580
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -0,0 +1,107 @@
+name: "c8_ctx1_gen11_tep8_batch128_eplb0_mtp3"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_workers: 11
+  decode_nodes: 11
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  trtllm_config:
+    prefill:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+    decode:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 128
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 3
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "88"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
new file mode 100644
index 000000000..5d910619d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
@@ -0,0 +1,182 @@
+name: "c128_ctx1_gen9_dep8_batch512_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (DEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      # Matches E2E standalone ctx_config.yaml
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (DEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      # Matches E2E standalone gen_config.yaml (DEP c=128)
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 512
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+          - 264
+          - 272
+          - 280
+          - 288
+          - 296
+          - 304
+          - 312
+          - 320
+          - 328
+          - 336
+          - 344
+          - 352
+          - 360
+          - 368
+          - 376
+          - 384
+          - 392
+          - 400
+          - 408
+          - 416
+          - 424
+          - 432
+          - 440
+          - 448
+          - 456
+          - 464
+          - 472
+          - 480
+          - 488
+          - 496
+          - 504
+          - 512
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1152"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
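+# DEP note: this recipe sets enable_attention_dp: true, so attention runs
+# data-parallel across the 8 GPUs while MoE stays expert-parallel, which favors
+# throughput at the large per-worker batch (512) used here.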
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..a11789b29
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c16_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=16)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "144"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
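+# TEP note: this recipe sets enable_attention_dp: false, keeping attention
+# sharded across the 8-way tensor-parallel group; the TEP points sit on the
+# low-latency end of the sweep, with smaller per-worker batches.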
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
new file mode 100644
index 000000000..554f516e2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
@@ -0,0 +1,113 @@
+name: "c1_ctx1_gen9_tep8_batch1_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=1)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 1
+      max_num_tokens: 1
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "11"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
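+# Latency-floor point: batch size 1 with a single captured CUDA graph;
+# concurrencies is 11 against 9 decode workers, presumably to keep every
+# worker busy while requests hand off through the frontend.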
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
new file mode 100644
index 000000000..c48eded81
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
@@ -0,0 +1,101 @@
+name: "c256_ctx1_gen6_dep8_batch512_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_workers: 6
+  decode_nodes: 6
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+  trtllm_config:
+    prefill:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 512
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1,2,4,8,16,24,32,40,48,56,64,72,80,88,96,104,112,120,128,136,144,152,160,168,176,184,192,200,208,216,224,232,240,248,256,264,272,280,288,296,304,312,320,328,336,344,352,360,368,376,384,392,400,408,416,424,432,440,448,456,464,472,480,488,496,504,512]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1536"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..473753df3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c32_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=32)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "288"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
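+# Memory note: prefill reserves only 0.6 of free GPU memory for KV cache while
+# decode reserves 0.9; a prefill worker holds at most 8 short requests, whereas
+# a decode worker here may cache KV for up to 256 in-flight sequences.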
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..80784e19d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c4_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=4)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "36"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
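+# Sequence-length note: the budgets track the 1k/1k workload; prefill
+# max_seq_len 1064 is the 1024-token ISL plus a 40-token margin, and decode
+# max_seq_len 2088 is ISL + 1024-token OSL plus the same margin.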
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
new file mode 100644
index 000000000..7c695e47f
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
@@ -0,0 +1,182 @@
+name: "c512_ctx2_gen7_dep8_batch512_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 2
+  prefill_workers: 2
+
+  decode_workers: 7
+  decode_nodes: 7
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (DEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      # Matches E2E standalone ctx_config.yaml
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (DEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      # Matches E2E standalone gen_config.yaml (DEP c=512)
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 512
+      max_num_tokens: 512
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+          - 264
+          - 272
+          - 280
+          - 288
+          - 296
+          - 304
+          - 312
+          - 320
+          - 328
+          - 336
+          - 344
+          - 352
+          - 360
+          - 368
+          - 376
+          - 384
+          - 392
+          - 400
+          - 408
+          - 416
+          - 424
+          - 432
+          - 440
+          - 448
+          - 456
+          - 464
+          - 472
+          - 480
+          - 488
+          - 496
+          - 504
+          - 512
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "3584"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..69d7b8708
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c64_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=64)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "576"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
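+# CUDA-graph note: with enable_padding: true, decode batches are padded up to
+# the nearest captured graph size, so the batch_sizes list only needs the
+# 1/2/4/8 ramp plus multiples of 8 up to max_batch_size (256).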
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
new file mode 100644
index 000000000..0c1828f27
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -0,0 +1,147 @@
+name: "c8_ctx1_gen9_tep8_batch256_eplb0_mtp0"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 9
+  decode_nodes: 9
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (TEP mode)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 8
+      max_num_tokens: 8192
+      max_seq_len: 1064
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.6
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (TEP c=8)
+      # ISL/OSL: 1k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_lm_head_tp_in_adp: false
+      enable_chunked_prefill: false
+      max_batch_size: 256
+      max_num_tokens: 256
+      max_seq_len: 2088
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.9
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 8192
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes:
+          - 1
+          - 2
+          - 4
+          - 8
+          - 16
+          - 24
+          - 32
+          - 40
+          - 48
+          - 56
+          - 64
+          - 72
+          - 80
+          - 88
+          - 96
+          - 104
+          - 112
+          - 120
+          - 128
+          - 136
+          - 144
+          - 152
+          - 160
+          - 168
+          - 176
+          - 184
+          - 192
+          - 200
+          - 208
+          - 216
+          - 224
+          - 232
+          - 240
+          - 248
+          - 256
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "72"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
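+# Environment note: the UCX_TLS pin and the PDL/GC/NCCL toggles above repeat in
+# every H200 recipe in this sweep; UCX_TLS appears to restrict UCX to IB verbs
+# (rc/dc/ud) plus the CUDA copy paths used for prefill-to-decode KV transfer,
+# with tcp as a fallback.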
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
new file mode 100644
index 000000000..3bacea3c6
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,117 @@
+name: "c128_ctx2_gen1_dep8_batch32_eplb0_mtp2"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 2
+  prefill_workers: 2
+
+  decode_workers: 1
+  decode_nodes: 1
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode)
+      # ISL/OSL: 8k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_chunked_prefill: false
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 32768
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=128)
+      # ISL/OSL: 8k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: true
+      enable_chunked_prefill: false
+      max_batch_size: 32
+      max_num_tokens: 128
+      max_seq_len: 9256
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1, 2, 4, 8, 16, 32]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
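+# Sizing note: for the 8k ISL, prefill max_num_tokens 16640 gives each of the
+# 2 requests in a prefill batch 8320 tokens (8192 ISL plus margin), and the
+# 32768-token transceiver buffer covers both in-flight KV transfers; this is
+# inferred from the numbers, not an upstream comment.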
+
+dynamo:
+  install: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
new file mode 100644
index 000000000..eaa4536a4
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
@@ -0,0 +1,117 @@
+name: "c16_ctx1_gen3_tep8_batch32_eplb0_mtp2"
+
+model:
+  path: "dsr1"
+  container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1"
+  precision: "fp8"
+
+sbatch_directives:
+  cpus-per-gpu: "16"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+
+  decode_workers: 3
+  decode_nodes: 3
+
+  gpus_per_node: 8
+
+backend:
+  type: trtllm
+
+  prefill_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  decode_environment:
+    UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp"
+    TRTLLM_ENABLE_PDL: "1"
+    TRTLLM_SERVER_DISABLE_GC: "1"
+    TRTLLM_WORKER_DISABLE_GC: "1"
+    NCCL_GRAPH_MIXING_SUPPORT: "0"
+
+  trtllm_config:
+    prefill:
+      # Prefill Worker Config for Dynamo DSR1 (MTP mode)
+      # ISL/OSL: 8k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_chunked_prefill: false
+      max_batch_size: 2
+      max_num_tokens: 16640
+      max_seq_len: 8232
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 32768
+      moe_config:
+        backend: CUTLASS
+      cuda_graph_config: null
+      disable_overlap_scheduler: true
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+    decode:
+      # Decode Worker Config for Dynamo DSR1 (MTP c=16)
+      # ISL/OSL: 8k/1k, TP=8 on H200
+      backend: pytorch
+      trust_remote_code: true
+      tensor_parallel_size: 8
+      moe_expert_parallel_size: 8
+      pipeline_parallel_size: 1
+      enable_attention_dp: false
+      enable_chunked_prefill: false
+      max_batch_size: 32
+      max_num_tokens: 128
+      max_seq_len: 9256
+      kv_cache_config:
+        enable_block_reuse: false
+        free_gpu_memory_fraction: 0.85
+        dtype: fp8
+      cache_transceiver_config:
+        backend: UCX
+        max_tokens_in_buffer: 16384
+      moe_config:
+        backend: CUTLASS
+        use_low_precision_moe_combine: true
+      cuda_graph_config:
+        enable_padding: true
+        batch_sizes: [1, 2, 4, 8, 16, 32]
+      disable_overlap_scheduler: false
+      print_iter_log: true
+      # Performance tuning
+      stream_interval: 100
+      num_postprocess_workers: 4
+      speculative_config:
+        decoding_type: MTP
+        num_nextn_predict_layers: 2
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "48"
+  req_rate: "inf"
+
+frontend:
+  type: "dynamo"
+  enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx.
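+# Ratio note: the 8k/1k recipes shift nodes toward prefill as concurrency grows
+# (1 ctx : 3 gen here, up to 3 ctx : 1 gen at c256), the opposite of the 1k/1k
+# sweep, which runs decode-heavy at 1 ctx : 9-11 gen.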
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml new file mode 100644 index 000000000..d84bf05a5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "c1_ctx1_gen7_tep8_batch1_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 7 + decode_nodes: 7 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=1) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 1 + max_num_tokens: 4 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "9" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml new file mode 100644 index 000000000..19fa4c9f0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -0,0 +1,117 @@ +name: "c256_ctx3_gen1_dep8_batch32_eplb0_mtp2" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=256) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..6eca7fe9d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "c32_ctx3_gen5_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=32) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "160" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..6cfd09aad --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "c4_ctx1_gen7_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 7 + decode_nodes: 7 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=4) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "28" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml new file mode 100644 index 000000000..ab5a8fa71 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml @@ -0,0 +1,117 @@ +name: "c512_ctx3_gen1_dep8_batch64_eplb0_mtp1" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=512) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 64 + max_num_tokens: 256 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 1 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml new file mode 100644 index 000000000..219a6f1b8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -0,0 +1,117 @@ +name: "c64_ctx1_gen1_dep8_batch32_eplb0_mtp2" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=64) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 2 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml new file mode 100644 index 000000000..d8dd374c2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml @@ -0,0 +1,117 @@ +name: "c8_ctx1_gen6_tep8_batch32_eplb0_mtp3" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 6 + decode_nodes: 6 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (MTP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + decode: + # Decode Worker Config for Dynamo DSR1 (MTP c=8) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "48" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..b92ecafe9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml @@ -0,0 +1,114 @@ +name: "c128_ctx1_gen1_dep8_batch256_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + # Matches E2E standalone ctx_config.yaml + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + + decode: + # Decode Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + # Matches E2E standalone gen_config.yaml (DEP c=128) + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..65eddfb81 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c16_ctx1_gen3_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=16) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "48" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml new file mode 100644 index 000000000..f42e7d15d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c1_ctx1_gen7_tep8_batch1_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 7 + decode_nodes: 7 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=1) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 1 + max_num_tokens: 1 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "9" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml new file mode 100644 index 000000000..5f96d875a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c256_ctx5_gen3_dep8_batch256_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 5 + prefill_workers: 5 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (DEP c=256) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 256 + max_num_tokens: 256 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "768" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..5f2976b4d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c32_ctx2_gen5_tep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 5 + decode_nodes: 5 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=32) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "160" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml new file mode 100644 index 000000000..72974bb20 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c4_ctx1_gen7_tep8_batch32_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 7 + decode_nodes: 7 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=4) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "28" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml new file mode 100644 index 000000000..a7a96394c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c512_ctx3_gen1_dep8_batch512_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 3 + prefill_workers: 3 + + decode_workers: 1 + decode_nodes: 1 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (DEP c=512) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 512 + max_num_tokens: 512 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml new file mode 100644 index 000000000..2a27575f2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c64_ctx2_gen3_dep8_batch128_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 2 + prefill_workers: 2 + + decode_workers: 3 + decode_nodes: 3 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (DEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (DEP c=64) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: true + enable_chunked_prefill: false + max_batch_size: 128 + max_num_tokens: 128 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "192" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml new file mode 100644 index 000000000..602646d9c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml @@ -0,0 +1,111 @@ +name: "c8_ctx1_gen6_tep8_batch16_eplb0_mtp0" + +model: + path: "dsr1" + container: "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1" + precision: "fp8" + +sbatch_directives: + cpus-per-gpu: "16" + +resources: + gpu_type: "h200" + prefill_nodes: 1 + prefill_workers: 1 + + decode_workers: 6 + decode_nodes: 6 + + gpus_per_node: 8 + +backend: + type: trtllm + + prefill_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + decode_environment: + UCX_TLS: "rc,dc,ud,cuda_copy,cuda_ipc,gdr_copy,tcp" + TRTLLM_ENABLE_PDL: "1" + TRTLLM_SERVER_DISABLE_GC: "1" + TRTLLM_WORKER_DISABLE_GC: "1" + NCCL_GRAPH_MIXING_SUPPORT: "0" + + trtllm_config: + prefill: + # Prefill Worker Config for Dynamo DSR1 (TEP mode) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 2 + max_num_tokens: 16640 + max_seq_len: 8232 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 32768 + moe_config: + backend: CUTLASS + cuda_graph_config: null + disable_overlap_scheduler: true + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + decode: + # Decode Worker Config for Dynamo DSR1 (TEP c=8) + # ISL/OSL: 8k/1k, TP=8 on H200 + backend: pytorch + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + enable_attention_dp: false + enable_chunked_prefill: false + max_batch_size: 16 + max_num_tokens: 16 + max_seq_len: 9256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 16384 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16] + disable_overlap_scheduler: false + print_iter_log: true + # Performance tuning + stream_interval: 100 + num_postprocess_workers: 4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "48" + req_rate: "inf" + +frontend: + type: "dynamo" + enable_multiple_frontends: false # For some reason, the H200 cluster doesn't like nginx. 
+ +dynamo: + install: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml new file mode 100644 index 000000000..ecdc9233a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-1p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 4096 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 4096 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024x2048x3072x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml new file mode 100644 index 000000000..43167b5f3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml @@ 
-0,0 +1,98 @@ +name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 1024 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml new file mode 100644 index 000000000..1ab6ca279 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml @@ -0,0 +1,98 @@ +name: "kimi-vllm-disagg-gb200-1p4d-dep4-tep4" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + 
connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 16 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 16 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml new file mode 100644 index 000000000..ca4e9813f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-3p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 3 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: 
"fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 256 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 256 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml new file mode 100644 index 000000000..cd9f94a9d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-5p1d-dep4-dep8" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + 
safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 512 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml new file mode 100644 index 000000000..47d3d7ee5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml @@ -0,0 +1,101 @@ +name: "kimi-vllm-disagg-gb200-6p1d-dep4-dep16" + +model: + path: "kimi-k2.5-nvfp4" + container: "vllm/vllm-openai:v0.18.0-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": 
"kv_both"}' + served-model-name: "nvidia/Kimi-K2.5-NVFP4" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 512 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3072x4096" + req_rate: "inf" diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index edf5db957..f465b4cdf 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -105,7 +105,7 @@ EOF echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." >&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 3c855e805..e7fd1ea49 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -108,7 +108,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." >&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 224c3a928..948689c76 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -125,10 +125,11 @@ PY fi -# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. -# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs. +# srt-slurm path requires CONFIG_FILE (set by benchmark-multinode-tmpl.yml from +# the search-space `recipe:` field). Without it, srtctl apply scans every YAML +# in the repo and submits hundreds of jobs. if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." 
>&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi @@ -140,21 +141,10 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 - # Use `cp -rT` so if the upstream branch ever ships a stub - # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto - # it rather than nesting (`cp -r src dst` would create - # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). - mkdir -p recipes/vllm/deepseek-v4 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 -elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 -elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then +# We only clone srt-slurm to install srtctl + pick up its sibling configs +# (configs/, expert-distributions/, etc). The recipe itself is supplied as an +# absolute CONFIG_FILE pointing at benchmarks/multi_node/srt-slurm-recipes/. +if [[ $FRAMEWORK == "dynamo-vllm" || ( $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ) ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 5f48ddcec..9af41a1ef 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -103,7 +103,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." >&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 5a2ab64d2..f531c19bf 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -126,7 +126,7 @@ EOF echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." >&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index e11ca7b20..368577d7c 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -119,7 +119,7 @@ EOF echo "Submitting job with srtctl..." if [[ -z "$CONFIG_FILE" ]]; then - echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a 'recipe:' field on the search-space entry (resolved by benchmark-multinode-tmpl.yml)." 
>&2 echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 exit 1 fi diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index e543bb4af..44613e8eb 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -267,6 +267,8 @@ def generate_full_sweep(args, all_config_data, runner_data): seq_len_str = seq_len_to_str(isl, osl) runners_for_entry = runner_nodes_to_use if runner_nodes_to_use else [runner] + recipe = bmk.get(Fields.RECIPE.value) + for runner_value in runners_for_entry: entry = { Fields.IMAGE.value: image, @@ -285,6 +287,7 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", Fields.DISAGG.value: disagg, Fields.RUN_EVAL.value: False, # Default, may be overridden by mark_eval_entries + Fields.RECIPE.value: recipe, } validate_matrix_entry(entry, is_multinode) @@ -463,6 +466,7 @@ def get_lowest_conc(search_space_entry): Fields.SPEC_DECODING.value, "none") prefill_config = lowest_conc_entry[Fields.PREFILL.value] decode_config = lowest_conc_entry[Fields.DECODE.value] + recipe = lowest_conc_entry.get(Fields.RECIPE.value) for node in runner_nodes: entry = { @@ -494,6 +498,7 @@ def get_lowest_conc(search_space_entry): Fields.EXP_NAME.value: f"{model_code}_test", Fields.DISAGG.value: disagg, Fields.RUN_EVAL.value: False, + Fields.RECIPE.value: recipe, } matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) else: @@ -620,6 +625,7 @@ def generate_test_config_sweep(args, all_config_data): Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", Fields.DISAGG.value: disagg, Fields.RUN_EVAL.value: False, + Fields.RECIPE.value: bmk.get(Fields.RECIPE.value), } matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) else: diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ce10840b5..7f1fa3326 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -1,3 +1,5 @@ +from pathlib import Path + from pydantic import BaseModel, Field, ValidationError, ConfigDict, model_validator from typing import List, Optional, Union, Literal from enum import Enum @@ -5,6 +7,11 @@ import pprint import yaml +# Repo-relative root for first-class srt-slurm recipes referenced by the +# `recipe:` field on multi-node search-space entries. Resolved against the +# repository root (parent of utils/) so callers can run from any cwd. +RECIPES_ROOT = Path(__file__).resolve().parents[2] / "benchmarks" / "multi_node" / "srt-slurm-recipes" + """ The below class defines the field names expected to be present in the JSON entries for both single-node and multi-node configurations. @@ -44,6 +51,7 @@ class Fields(Enum): BATCH_SIZE = 'batch-size' MAX_NUM_TOKENS = 'max-num-tokens' ADDITIONAL_SETTINGS = 'additional-settings' + RECIPE = 'recipe' # Matrix entry fields CONC = 'conc' @@ -131,6 +139,11 @@ class MultiNodeMatrixEntry(BaseModel): run_eval: bool = Field(alias=Fields.RUN_EVAL.value) eval_only: bool = Field(alias=Fields.EVAL_ONLY.value, default=False) eval_conc: Optional[int] = Field(default=None, alias=Fields.EVAL_CONC.value) + # Path under benchmarks/multi_node/srt-slurm-recipes/ identifying the + # srt-slurm recipe to dispatch. May carry an `:override[N]` suffix that the + # launcher strips before resolving the file on disk. Optional because not + # every multi-node config uses srt-slurm. 
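+ # Illustrative values (hypothetical recipe names, shown for shape only):
+ #   recipe: "b200-fp8/8k1k.yaml"              # plain file reference
+ #   recipe: "b200-fp8/8k1k.yaml:override[2]"  # with override selector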
+ recipe: Optional[str] = None def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: @@ -234,11 +247,31 @@ class MultiNodeSearchSpaceEntry(BaseModel): default=None, alias=Fields.CONC_END.value) conc_list: Optional[List[int]] = Field( default=None, alias=Fields.CONC_LIST.value) + # First-class srt-slurm recipe reference. Path is relative to + # benchmarks/multi_node/srt-slurm-recipes/ and may carry an + # `:override[N]` suffix to select an in-yaml override section. + recipe: Optional[str] = None @model_validator(mode='after') def validate_conc_fields(self): return _validate_conc_fields(self) + @model_validator(mode='after') + def validate_recipe_exists(self): + if self.recipe is None: + return self + # Strip `:override[...]` suffix used by sglang-style recipes that + # carry multiple override sections in one file. + recipe_path = self.recipe.split(':', 1)[0] + full_path = RECIPES_ROOT / recipe_path + if not full_path.is_file(): + raise ValueError( + f"Recipe file not found: '{self.recipe}' " + f"(resolved to '{full_path}'). " + f"Recipes must live under benchmarks/multi_node/srt-slurm-recipes/." + ) + return self + class SingleNodeSeqLenConfig(BaseModel): """Single node sequence length configuration.""" From 89bf3e37ca993c91d8d998b3c280962e4360b504 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 09:56:21 -0500 Subject: [PATCH 02/16] runners: factor srt-slurm clone+srtctl install into benchmark_lib helper Six launchers each carried a ~22-line copy of the same git-clone, uv-install, venv-create, srtctl-install sequence. Lift it into clone_and_install_srtctl() in benchmarks/benchmark_lib.sh, parameterized by SRT_REPO_URL/SRT_BRANCH and UV_INSTALL_DIR/UV_VENV_DIR env vars so each launcher can keep its workspace- vs-NFS-vs-default-HOME placement decisions explicit at the call site. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 49 +++++++++++++++++++++++++++++++ runners/launch_b200-dgxc.sh | 29 ++++-------------- runners/launch_b300-nv.sh | 29 ++++-------------- runners/launch_gb200-nv.sh | 34 +++++---------------- runners/launch_gb300-nv.sh | 29 ++++-------------- runners/launch_h100-dgxc-slurm.sh | 35 +++++----------------- runners/launch_h200-dgxc-slurm.sh | 26 ++-------------- 7 files changed, 82 insertions(+), 149 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 268745735..576cf7c4b 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -862,3 +862,52 @@ run_eval() { fi return $eval_rc } + +# -------------------------------- +# srt-slurm helpers +# -------------------------------- + +# Clone srt-slurm and install `srtctl` into a uv venv. After this returns +# successfully, cwd is the cloned repo and the venv is active. Idempotent on +# uv: skips re-curl if the binary is already present at $UV_INSTALL_DIR. 
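+#
+# Typical call (matches the workspace-pinned launchers later in this patch):
+#   UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" \
+#   UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \
+#   clone_and_install_srtctl || exit 1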
+# +# All inputs are env vars (set before calling); all are optional: +# SRT_REPO_URL default https://github.com/NVIDIA/srt-slurm.git +# SRT_BRANCH default sa-submission-q2-2026 +# SRT_REPO_DIR default srt-slurm (relative to current cwd) +# UV_INSTALL_DIR default $HOME/.local/bin (uv's own default) +# UV_VENV_DIR default .venv (inside the cloned repo) +clone_and_install_srtctl() { + local repo_url="${SRT_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" + local branch="${SRT_BRANCH:-sa-submission-q2-2026}" + local repo_dir="${SRT_REPO_DIR:-srt-slurm}" + local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" + local uv_venv_dir="${UV_VENV_DIR:-.venv}" + + echo "Cloning ${repo_url}@${branch} into ${repo_dir}..." + rm -rf "$repo_dir" + git clone "$repo_url" "$repo_dir" + cd "$repo_dir" || return 1 + git checkout "$branch" + + echo "Installing uv + srtctl into venv at ${uv_venv_dir}..." + export UV_INSTALL_DIR="$uv_install_dir" + mkdir -p "$uv_install_dir" + if ! [ -x "$uv_install_dir/uv" ]; then + curl -LsSf https://astral.sh/uv/install.sh | sh + fi + export PATH="$uv_install_dir:$PATH" + # uv's installer drops an `env` script next to the binary; source it so + # PATH/PS1 changes pick up in shells that don't re-read the env. + [ -f "$uv_install_dir/env" ] && source "$uv_install_dir/env" + + uv venv "$uv_venv_dir" + # shellcheck disable=SC1091 + source "$uv_venv_dir/bin/activate" + uv pip install -e . + + if ! command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" >&2 + return 1 + fi +} diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f465b4cdf..f0ad3deed 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -4,6 +4,8 @@ SLURM_PARTITION="gpu" SLURM_ACCOUNT="benchmark" +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x if [[ "$IS_MULTINODE" == "true" ]]; then @@ -29,30 +31,9 @@ if [[ "$IS_MULTINODE" == "true" ]]; then fi export SERVED_MODEL_NAME=$MODEL - echo "Cloning srt-slurm repository..." - SRT_REPO_DIR="srt-slurm" - if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" - fi - - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" || exit 1 - git checkout sa-submission-q2-2026 - - echo "Installing srtctl..." - export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" - curl -LsSf https://astral.sh/uv/install.sh | sh - export PATH="$UV_INSTALL_DIR:$PATH" - - uv venv "$GITHUB_WORKSPACE/.venv" - source "$GITHUB_WORKSPACE/.venv/bin/activate" - uv pip install -e . - - if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 - fi + UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" \ + UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \ + clone_and_install_srtctl || exit 1 # Map container images to local squash files NGINX_IMAGE="nginx:1.27.4" diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index e7fd1ea49..6fc373a41 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -4,6 +4,8 @@ SLURM_PARTITION="batch_1" SLURM_ACCOUNT="benchmark" +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x if [[ "$IS_MULTINODE" == "true" ]]; then @@ -30,30 +32,9 @@ else exit 1 fi -echo "Cloning srt-slurm repository..." -SRT_REPO_DIR="srt-slurm" -if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." 
- rm -rf "$SRT_REPO_DIR" -fi - -git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" -cd "$SRT_REPO_DIR" || exit 1 -git checkout sa-submission-q2-2026 - -echo "Installing srtctl..." -export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" -curl -LsSf https://astral.sh/uv/install.sh | sh -export PATH="$UV_INSTALL_DIR:$PATH" - -uv venv "$GITHUB_WORKSPACE/.venv" -source "$GITHUB_WORKSPACE/.venv/bin/activate" -uv pip install -e . - -if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 -fi +UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" \ +UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \ + clone_and_install_srtctl || exit 1 # Map container images to local squash files NGINX_IMAGE="nginx:1.27.4" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 948689c76..13bcd9a5d 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -2,6 +2,8 @@ # This script sets up the environment and launches multi-node benchmarks +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x # MODEL_PATH: Override with pre-downloaded paths on GB200 runner @@ -134,38 +136,18 @@ if [[ -z "$CONFIG_FILE" ]]; then exit 1 fi -echo "Cloning srt-slurm repository..." -SRT_REPO_DIR="srt-slurm" -if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" -fi - # We only clone srt-slurm to install srtctl + pick up its sibling configs # (configs/, expert-distributions/, etc). The recipe itself is supplied as an # absolute CONFIG_FILE pointing at benchmarks/multi_node/srt-slurm-recipes/. if [[ $FRAMEWORK == "dynamo-vllm" || ( $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ) ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + SRT_REPO_URL=https://github.com/NVIDIA/srt-slurm.git + SRT_BRANCH=sa-submission-q2-2026 else - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q1-2026 -fi - -echo "Installing srtctl..." -curl -LsSf https://astral.sh/uv/install.sh | sh -source $HOME/.local/bin/env - -uv venv -source .venv/bin/activate -uv pip install -e . - -if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 + SRT_REPO_URL=https://github.com/ishandhanani/srt-slurm.git + SRT_BRANCH=sa-submission-q1-2026 fi +SRT_REPO_URL="$SRT_REPO_URL" SRT_BRANCH="$SRT_BRANCH" \ + clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 9af41a1ef..58f82eb83 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -2,6 +2,8 @@ # This script sets up the environment and launches multi-node benchmarks +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x export SLURM_PARTITION="batch" @@ -36,30 +38,9 @@ export EVAL_ONLY="${EVAL_ONLY:-false}" export ISL="$ISL" export OSL="$OSL" -echo "Cloning srt-slurm repository..." -SRT_REPO_DIR="srt-slurm" -if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" -fi - -git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" -cd "$SRT_REPO_DIR" -git checkout sa-submission-q2-2026 - -echo "Installing srtctl..." 
-export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" -curl -LsSf https://astral.sh/uv/install.sh | sh -export PATH="$UV_INSTALL_DIR:$PATH" - -uv venv "$GITHUB_WORKSPACE/.venv" -source "$GITHUB_WORKSPACE/.venv/bin/activate" -uv pip install -e . - -if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 -fi +UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" \ +UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \ + clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index f531c19bf..602664a09 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -5,6 +5,8 @@ SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" SLURM_EXCLUDED_NODELIST="hpc-gpu-1-7" +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x if [[ "$IS_MULTINODE" == "true" ]]; then @@ -34,36 +36,13 @@ if [[ "$IS_MULTINODE" == "true" ]]; then exit 1 fi - echo "Cloning srt-slurm repository..." - SRT_REPO_DIR="srt-slurm" - if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" - fi - - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 - - echo "Installing srtctl..." - export UV_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/bin" + # Pin uv state onto the NFS-shared volume so cluster nodes share a single + # cached install, and so the binary persists across runner workspaces. export UV_CACHE_DIR="/mnt/nfs/sa-shared/.uv/cache" export UV_PYTHON_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/python" - mkdir -p "$UV_INSTALL_DIR" "$UV_CACHE_DIR" "$UV_PYTHON_INSTALL_DIR" - if ! [ -x "$UV_INSTALL_DIR/uv" ]; then - curl -LsSf https://astral.sh/uv/install.sh | sh - fi - export PATH="$UV_INSTALL_DIR:$PATH" - source $UV_INSTALL_DIR/env - - uv venv - source .venv/bin/activate - uv pip install -e . - - if ! command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 - fi + mkdir -p "$UV_CACHE_DIR" "$UV_PYTHON_INSTALL_DIR" + UV_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/bin" \ + clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 368577d7c..b61cbb0bf 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -4,6 +4,8 @@ SLURM_PARTITION="main" SLURM_ACCOUNT="sa-shared" +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + set -x if [[ "$IS_MULTINODE" == "true" ]]; then @@ -33,29 +35,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then exit 1 fi - echo "Cloning srt-slurm repository..." - SRT_REPO_DIR="srt-slurm" - if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" - fi - - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" - cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 - - echo "Installing srtctl..." - curl -LsSf https://astral.sh/uv/install.sh | sh - source $HOME/.local/bin/env - - uv venv - source .venv/bin/activate - uv pip install -e . - - if ! 
command -v srtctl &> /dev/null; then - echo "Error: Failed to install srtctl" - exit 1 - fi + clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" From d29d06615d71747d469a082a923c0d65338e4005 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 10:09:08 -0500 Subject: [PATCH 03/16] runners: factor image-filename sanitization into benchmark_lib helper Lift the `echo "$IMAGE" | sed 's/[/:@#]/_/g'` slug used to name squash files out of 13 launchers and into sanitize_image_filename() in benchmark_lib.sh. Cluster-specific separator (h100/h200-dgxc-slurm use '+' instead of '_') is expressed as the second arg, and the nvcr.io/-prefix-strip variant becomes `sanitize_image_filename "${IMAGE#nvcr.io/}" +` rather than a sed pipeline. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 14 ++++++++++++++ runners/launch_b200-cw.sh | 4 +++- runners/launch_b200-dgxc.sh | 6 +++--- runners/launch_b300-nv.sh | 6 +++--- runners/launch_gb200-nv.sh | 4 ++-- runners/launch_gb300-nv.sh | 4 ++-- runners/launch_h100-cw.sh | 4 +++- runners/launch_h100-dgxc-slurm.sh | 4 ++-- runners/launch_h200-cw.sh | 4 +++- runners/launch_h200-dgxc-slurm.sh | 6 +++--- runners/launch_h200-nb.sh | 4 +++- runners/launch_mi300x-amds.sh | 4 +++- runners/launch_mi325x-amds.sh | 4 +++- runners/launch_mi355x-amds.sh | 4 +++- 14 files changed, 50 insertions(+), 22 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 576cf7c4b..92998de27 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -863,6 +863,20 @@ run_eval() { return $eval_rc } +# -------------------------------- +# Container helpers +# -------------------------------- + +# Sanitize a container image reference (e.g. "lmsysorg/sglang:v0.5.8-cu130") +# into a filename-safe slug by replacing /, :, @, # with the chosen separator. +# Defaults to '_' (most clusters); pass '+' for clusters that adopted that +# convention for their squash-file directory. +sanitize_image_filename() { + local image="$1" + local sep="${2:-_}" + echo "$image" | sed "s|[/:@#]|${sep}|g" +} + # -------------------------------- # srt-slurm helpers # -------------------------------- diff --git a/runners/launch_b200-cw.sh b/runners/launch_b200-cw.sh index 0b2dbf305..fbdd60554 100644 --- a/runners/launch_b200-cw.sh +++ b/runners/launch_b200-cw.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache" export PORT=8888 @@ -16,7 +18,7 @@ if [[ ! 
-f "$BENCH_SCRIPT" ]]; then fi PARTITION="b200" -SQUASH_FILE="/tmp/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/tmp/gharunner/squash/$(sanitize_image_filename "$IMAGE").sqsh" LOCK_FILE="${SQUASH_FILE}.lock" # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f0ad3deed..3e294f859 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -37,8 +37,8 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # Map container images to local squash files NGINX_IMAGE="nginx:1.27.4" - SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - NGINX_SQUASH_FILE="/home/sa-shared/containers/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/home/sa-shared/containers/$(sanitize_image_filename "$IMAGE").sqsh" + NGINX_SQUASH_FILE="/home/sa-shared/containers/$(sanitize_image_filename "$NGINX_IMAGE").sqsh" # Import containers via enroot enroot import -o $SQUASH_FILE docker://$IMAGE @@ -231,7 +231,7 @@ EOF else HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" - SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/home/sa-shared/containers/$(sanitize_image_filename "$IMAGE").sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') # Prefer a framework-tagged script (e.g. dsv4_fp4_b200_vllm.sh) so models diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 6fc373a41..23f75ac80 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -38,8 +38,8 @@ UV_VENV_DIR="$GITHUB_WORKSPACE/.venv" \ # Map container images to local squash files NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/data/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/data/squash/$(sanitize_image_filename "$IMAGE").sqsh" +NGINX_SQUASH_FILE="/data/squash/$(sanitize_image_filename "$NGINX_IMAGE").sqsh" # Import containers via enroot srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" @@ -239,7 +239,7 @@ else elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro" fi - SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(sanitize_image_filename "$IMAGE").sqsh" SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') # Prefer a framework-tagged script (e.g. 
dsv4_fp4_b300_sglang.sh) so models # with multiple inference engines can coexist; fall back to the historical diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 13bcd9a5d..e9c3e62b8 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -64,8 +64,8 @@ export SLURM_ACCOUNT="benchmark" NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(sanitize_image_filename "$IMAGE").sqsh" +NGINX_SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(sanitize_image_filename "$NGINX_IMAGE").sqsh" enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 58f82eb83..a0790260e 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -27,8 +27,8 @@ fi NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/home/sa-shared/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/home/sa-shared/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/home/sa-shared/squash/$(sanitize_image_filename "$IMAGE").sqsh" +NGINX_SQUASH_FILE="/home/sa-shared/squash/$(sanitize_image_filename "$NGINX_IMAGE").sqsh" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index f3198ca8c..e036e6219 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -1,8 +1,10 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" -SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/vast/gharunner/squash/$(sanitize_image_filename "$IMAGE").sqsh" LOCK_FILE="${SQUASH_FILE}.lock" set -x diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 602664a09..f95816448 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -56,7 +56,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then elif [[ $FRAMEWORK == "dynamo-trt" ]]; then # TRT-LLM container mapping - convert IMAGE to srt-slurm format (nvcr.io/ -> nvcr.io#) CONTAINER_KEY=$(echo "$IMAGE" | sed 's|nvcr.io/|nvcr.io#|') - SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(echo "$IMAGE" | sed 's|nvcr.io/||' | sed 's/[\/:@#]/+/g').sqsh" + SQUASH_FILE="/mnt/nfs/sa-shared/containers/$(sanitize_image_filename "${IMAGE#nvcr.io/}" +).sqsh" fi export ISL="$ISL" @@ -249,7 +249,7 @@ EOF else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" - SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/mnt/nfs/lustre/containers/$(sanitize_image_filename "$IMAGE").sqsh" salloc --exclude="$SLURM_EXCLUDED_NODELIST" --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 84b40480c..08bbbc757 100644 --- a/runners/launch_h200-cw.sh +++ 
b/runners/launch_h200-cw.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" export PORT=8888 @@ -8,7 +10,7 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="h200" -SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/vast/gharunner/squash/$(sanitize_image_filename "$IMAGE").sqsh" LOCK_FILE="${SQUASH_FILE}.lock" set -x diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index b61cbb0bf..71a64025f 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -44,12 +44,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ $FRAMEWORK == "dynamo-sglang" ]]; then # SGLang container mapping - SQUASH_FILE="/data/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/+/g').sqsh" + SQUASH_FILE="/data/containers/$(sanitize_image_filename "$IMAGE" +).sqsh" CONTAINER_KEY="$IMAGE" elif [[ $FRAMEWORK == "dynamo-trt" ]]; then # TRT-LLM container mapping - convert IMAGE to srt-slurm format (nvcr.io/ -> nvcr.io#) CONTAINER_KEY=$(echo "$IMAGE" | sed 's|nvcr.io/|nvcr.io#|') - SQUASH_FILE="/data/containers/$(echo "$IMAGE" | sed 's|nvcr.io/||' | sed 's/[\/:@#]/+/g').sqsh" + SQUASH_FILE="/data/containers/$(sanitize_image_filename "${IMAGE#nvcr.io/}" +).sqsh" fi export ISL="$ISL" @@ -242,7 +242,7 @@ EOF else HF_HUB_CACHE_MOUNT="/models/gharunners/hf-hub-cache" - SQUASH_FILE="/data/gharunners/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/data/gharunners/containers/$(sanitize_image_filename "$IMAGE").sqsh" # Convert pyxis image format (nvcr.io#path) to docker format (nvcr.io/path) for enroot import DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g') diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..849f73699 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 @@ -12,7 +14,7 @@ PARTITION="main" set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ ---container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ +--container-name=$(sanitize_image_filename "$IMAGE")-${USER} \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --container-remap-root \ --container-writable \ diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b654c515a..da98f3015 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export HF_HUB_CACHE_MOUNT="/raid/hf-hub-cache/" export PORT=8888 PARTITION="compute" -SQUASH_FILE="/home/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/home/gharunner/gharunners/squash/$(sanitize_image_filename "$IMAGE").sqsh" LOCK_FILE="${SQUASH_FILE}.lock" set -x diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 67f93a309..200b46838 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/../benchmarks/benchmark_lib.sh" + export 
HF_HUB_CACHE_MOUNT="/nfsdata/sa/gharunner/gharunners/hf-hub-cache/"
 export PORT=8888

 PARTITION="compute"
-SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+SQUASH_FILE="/nfsdata/sa/gharunner/gharunners/squash/$(sanitize_image_filename "$IMAGE").sqsh"
 LOCK_FILE="${SQUASH_FILE}.lock"

 set -x
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 152745d4e..a14cfdb2c 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash

+source "$(dirname "$0")/../benchmarks/benchmark_lib.sh"
+
 scancel_sync() {
     local jobid=$1
     local timeout=${2:-600}
@@ -182,7 +184,7 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

     PARTITION="compute"
-    SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+    SQUASH_FILE="/var/lib/squash/$(sanitize_image_filename "$IMAGE").sqsh"
     LOCK_FILE="${SQUASH_FILE}.lock"

     set -x

From 77de8570d26edfbac0e4cdbcca75ee11be4211c4 Mon Sep 17 00:00:00 2001
From: Cam Quilici
Date: Tue, 28 Apr 2026 10:54:32 -0500
Subject: [PATCH 04/16] srt-slurm: reorganize recipes by
 model/framework/hw/seq-len/topology

Restructure benchmarks/multi_node/srt-slurm-recipes/ from the
upstream's heterogeneous layout into a uniform tree:

    <model>/<framework>/<hw>-<precision>/<isl>/<osl>/<topology>/<stp|mtp>/<recipe>.yaml

so a sweep contributor can navigate to
dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ and immediately see every recipe
that fits that cell. The 3 sglang multi-override files that span both
stp and mtp are parked one level shallower (no trailing stp|mtp/), since
the override section selects the spec mode.

365 files moved, 388 active + 5 commented recipe references rewritten,
schema validation + tests still green.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/configs/CONFIGS.md | 4 +-
 .github/configs/nvidia-master.yaml | 786 +++++++++---------
 .../sglang/b200-fp4/1k/1k/disagg}/1k1k.yaml | 0
 .../sglang/b200-fp4/8k/1k/disagg}/8k1k.yaml | 0
 .../sglang/b200-fp8/1k/1k/disagg}/1k1k.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_lowlat_0.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_lowlat_1.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_lowlat_2.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_0.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_1.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_2.yaml | 0
 .../8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_3.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_lowlat_0.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_lowlat_1.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_lowlat_2.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_maxtpt_0.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_maxtpt_1.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_maxtpt_2.yaml | 0
 .../8k/1k/disagg/stp}/8k1k_stp_maxtpt_3.yaml | 0
 .../1k/1k/disagg/stp}/low-latency.yaml | 0
 .../gb200-fp4/1k/1k/disagg/stp}/max-tpt.yaml | 0
 .../1k/1k/disagg/stp}/mid-curve.yaml | 0
 .../8k/1k/disagg/stp}/low-latency.yaml | 0
 .../gb200-fp4/8k/1k/disagg/stp}/max-tpt.yaml | 0
 .../8k/1k/disagg/stp}/mid-curve.yaml | 0
 .../1k/1k/disagg/stp}/low-latency.yaml | 0
 .../gb200-fp8/1k/1k/disagg/stp}/max-tpt.yaml | 0
 .../1k/1k/disagg/stp}/mid-curve.yaml | 0
 .../1k/1k/disagg/stp}/ultra-tpt.yaml | 0
 .../8k/1k/disagg/stp}/low-latency.yaml | 0
 .../gb200-fp8/8k/1k/disagg/stp}/max_tpt.yaml | 0
 .../8k/1k/disagg/stp}/mid-curve.yaml | 0
 .../1k/1k/disagg/stp}/low_latency.yaml | 0
 .../gb300-fp4/1k/1k/disagg/stp}/max_tpt.yaml | 0
 .../1k/1k/disagg/stp}/mid_curve.yaml | 0
 .../8k/1k/disagg/stp}/low_latency.yaml | 0
 .../gb300-fp4/8k/1k/disagg/stp}/max_tpt.yaml | 0
 .../8k/1k/disagg/stp}/mid_curve.yaml | 0
.../1k/1k/disagg}/stp/low-latency.yaml | 0 .../gb300-fp8/1k/1k/disagg}/stp/max.yaml | 0 .../gb300-fp8/1k/1k/disagg}/stp/mid.yaml | 0 .../8k/1k/disagg}/stp/low-latency.yaml | 0 .../gb300-fp8/8k/1k/disagg}/stp/max.yaml | 0 .../gb300-fp8/8k/1k/disagg}/stp/mid.yaml | 0 .../mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 0 .../disagg}/mtp/h100-fp8-1p2d-max-tp-mtp.yaml | 0 .../1k/disagg}/stp/h100-fp8-1p1d-max-dep.yaml | 0 .../1k/disagg}/stp/h100-fp8-1p2d-max-tp.yaml | 0 .../mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 0 .../disagg}/mtp/h100-fp8-1p1d-max-tp-mtp.yaml | 0 .../1k/disagg}/stp/h100-fp8-1p1d-max-dep.yaml | 0 .../1k/disagg}/stp/h100-fp8-1p1d-max-tp.yaml | 0 .../1k/1k/disagg/mtp}/bs256-1p6d-dep-mtp.yaml | 0 .../1k/1k/disagg/mtp}/bs256-1p6d-tp-mtp.yaml | 0 .../1k/disagg/mtp}/low-latency-1p9d-mtp.yaml | 0 .../1k/1k/disagg/stp}/bs256-1p6d-dep.yaml | 0 .../1k/1k/disagg/stp}/bs256-1p6d-tp.yaml | 0 .../1k/1k/disagg/stp}/low-latency-1p9d.yaml | 0 .../8k/1k/disagg/mtp}/bs128-1p1d-dep-mtp.yaml | 0 .../8k/1k/disagg/mtp}/bs16-1p3d-mtp.yaml | 0 .../8k/1k/disagg/mtp}/bs4-1p7d-mtp.yaml | 0 .../8k/1k/disagg/mtp}/bs64-2p3d-mtp.yaml | 0 .../8k/1k/disagg/mtp}/bs8-1p6d-mtp.yaml | 0 .../8k/1k/disagg/stp}/bs128-1p1d-dep.yaml | 0 .../h200-fp8/8k/1k/disagg/stp}/bs16-1p3d.yaml | 0 .../h200-fp8/8k/1k/disagg/stp}/bs4-1p7d.yaml | 0 .../h200-fp8/8k/1k/disagg/stp}/bs64-2p3d.yaml | 0 .../h200-fp8/8k/1k/disagg/stp}/bs8-1p6d.yaml | 0 .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 0 .../ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0 .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 0 .../ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml | 0 .../ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml | 0 .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 0 .../ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml | 0 .../ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 0 .../ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml | 0 .../ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml | 0 .../mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml | 0 .../ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml | 0 .../ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml | 0 .../ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml | 0 .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml | 0 .../ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml | 0 .../ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml | 0 .../ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml | 0 .../ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml | 0 ...x1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml | 0 ...x1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml | 0 ...x1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml | 0 ...tx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml | 0 .../ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml | 0 .../ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml | 0 .../ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml | 0 .../ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml | 0 ...x1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml | 0 ...tx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml | 0 ...ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml | 0 .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml | 0 ...tx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml | 0 ...x2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml | 0 .../ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml | 0 .../ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml | 0 .../ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml | 0 .../ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml | 0 ...ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml | 0 
.../ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml | 0 ...x4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml | 0 ...tx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml | 0 ...tx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml | 0 .../ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml | 0 ...ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml | 0 .../ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml | 0 .../ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml | 0 .../ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml | 0 ...tx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml | 0 .../ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml | 0 .../ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0 .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 0 .../ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml | 0 .../ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml | 0 .../ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml | 0 .../ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 0 .../ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml | 0 .../ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml | 0 .../ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml | 0 .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 0 .../mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 0 .../ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml | 0 .../ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml | 0 .../ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml | 0 .../ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml | 0 .../ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml | 0 .../ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml | 0 ...tx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml | 0 ...x1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml | 0 ...ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml | 0 .../ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml | 0 .../ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml | 0 ...x3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml | 0 ...x1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml | 0 ...tx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml | 0 ...ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml | 0 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml | 0 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml | 0 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml | 0 ...2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml | 0 .../ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml | 0 .../ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml | 0 .../ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml | 0 .../ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml | 0 .../ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml | 0 .../ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml | 0 .../ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml | 0 .../ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml | 0 .../ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml | 0 .../ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml | 0 .../ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml | 0 ...tx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml | 0 ...x7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml | 0 .../ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0 ...ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml | 0 .../ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml | 0 .../ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml | 0 .../ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml | 0 .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 0 .../ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 
.../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 0 ...ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml | 0 .../ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml | 0 ...tx11_gen1_dep16_batch256_eplb256_mtp1.yaml | 0 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0 .../ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml | 0 .../ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml | 0 .../ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml | 0 ...tx10_gen1_dep16_batch256_eplb256_mtp0.yaml | 0 .../ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml | 0 .../ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml | 0 .../ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml | 0 ...x1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 0 ...tx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml | 0 ...x1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml | 0 ...x1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml | 0 .../ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml | 0 .../ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml | 0 .../ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml | 0 ...1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml | 0 ...x1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml | 0 ...ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml | 0 ...x1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml | 0 ...x1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml | 0 .../ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml | 0 .../ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml | 0 .../ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml | 0 .../ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml | 0 .../ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml | 0 ...tx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml | 0 ...ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml | 0 ...ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 0 ...tx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0 .../ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml | 0 .../ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml | 0 .../ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml | 0 ...ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml | 0 ...tx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml | 0 ...tx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml | 0 ...x5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 0 .../mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml | 0 .../ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml | 0 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0 ...ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml | 0 .../ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml | 0 ...ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml | 0 .../ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml | 0 .../ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml | 0 .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 0 ...ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml | 0 .../mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0 .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 0 .../ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml | 0 .../ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml | 0 .../ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml | 0 .../ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml | 0 .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 0 .../ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml | 0 .../ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml | 0 ...tx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0 ...ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml | 0 
.../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 0 .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 0 ...2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml | 0 ...tx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml | 0 ...x3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml | 0 .../ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml | 0 .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 0 .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 0 ...2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml | 0 ...x2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml | 0 ...x3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml | 0 ...3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml | 0 ...10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 0 .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 0 .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 0 ...ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 0 ...x7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml | 0 ...tx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0 .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 0 .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 0 .../ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml | 0 ...tx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml | 0 ...tx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml | 0 ...x7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 0 ...x7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml | 0 .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 0 .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 0 .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 0 .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 0 .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 0 .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 0 .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 0 .../ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml | 0 .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 0 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 0 .../ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 0 .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 0 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 0 .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 0 ...28_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml | 0 ...16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml | 0 .../c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml | 0 ...56_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml | 0 ...2_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0 ...4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0 ...12_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml | 0 ...64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml | 0 ...8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0 ...28_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml | 0 ...16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 .../c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml | 0 ...56_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml | 0 ...32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 ...c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 ...12_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml | 0 ...64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 ...c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0 
...128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml | 0 ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml | 0 .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml | 0 ...256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml | 0 ...c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml | 0 .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml | 0 ...512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml | 0 ...c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml | 0 .../c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml | 0 ...28_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml | 0 ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml | 0 .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml | 0 ...56_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml | 0 ...32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml | 0 .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml | 0 ...12_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml | 0 ...64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml | 0 .../c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml | 0 .../stp}/disagg-gb200-1p1d-dep8-dep16.yaml | 0 .../stp}/disagg-gb200-1p1d-dep8-tep8.yaml | 0 .../stp}/disagg-gb200-3p1d-dep8-dep16.yaml | 0 .../stp}/disagg-gb200-1p1d-dep8-tep8.yaml | 0 .../stp}/disagg-gb200-3p1d-dep8-dep16.yaml | 0 .../stp}/disagg-gb200-7p1d-dep8-dep16.yaml | 0 ...ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 0 ...ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 0 ..._gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 0 ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 0 ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 0 ...tx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 0 ...tx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 0 ...4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 0 ...p4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 0 ...4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 0 ...ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 0 ...ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 0 ..._gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 0 ...tx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 0 .../stp}/disagg-gb200-1p1d-dep4-dep16.yaml | 0 .../stp}/disagg-gb200-1p4d-dep4-tep4.yaml | 0 .../stp}/disagg-gb200-1p4d-dep4-tep4.yaml | 0 .../stp}/disagg-gb200-3p1d-dep4-dep16.yaml | 0 .../stp}/disagg-gb200-5p1d-dep4-dep8.yaml | 0 .../stp}/disagg-gb200-6p1d-dep4-dep16.yaml | 0 372 files changed, 395 insertions(+), 395 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp4 => dsr1/sglang/b200-fp4/1k/1k/disagg}/1k1k.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp4 => dsr1/sglang/b200-fp4/8k/1k/disagg}/8k1k.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/1k/1k/disagg}/1k1k.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_lowlat_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_lowlat_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_lowlat_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/mtp}/8k1k_mtp_maxtpt_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => 
dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_lowlat_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_lowlat_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_lowlat_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_maxtpt_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_maxtpt_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_maxtpt_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{b200-fp8 => dsr1/sglang/b200-fp8/8k/1k/disagg/stp}/8k1k_stp_maxtpt_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/1k1k => dsr1/sglang/gb200-fp4/1k/1k/disagg/stp}/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/1k1k => dsr1/sglang/gb200-fp4/1k/1k/disagg/stp}/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/1k1k => dsr1/sglang/gb200-fp4/1k/1k/disagg/stp}/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/8k1k => dsr1/sglang/gb200-fp4/8k/1k/disagg/stp}/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/8k1k => dsr1/sglang/gb200-fp4/8k/1k/disagg/stp}/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp4/8k1k => dsr1/sglang/gb200-fp4/8k/1k/disagg/stp}/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/1k1k => dsr1/sglang/gb200-fp8/1k/1k/disagg/stp}/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/1k1k => dsr1/sglang/gb200-fp8/1k/1k/disagg/stp}/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/1k1k => dsr1/sglang/gb200-fp8/1k/1k/disagg/stp}/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/1k1k => dsr1/sglang/gb200-fp8/1k/1k/disagg/stp}/ultra-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/8k1k => dsr1/sglang/gb200-fp8/8k/1k/disagg/stp}/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/8k1k => dsr1/sglang/gb200-fp8/8k/1k/disagg/stp}/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb200-fp8/8k1k => dsr1/sglang/gb200-fp8/8k/1k/disagg/stp}/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/1k1k => dsr1/sglang/gb300-fp4/1k/1k/disagg/stp}/low_latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/1k1k => dsr1/sglang/gb300-fp4/1k/1k/disagg/stp}/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/1k1k => dsr1/sglang/gb300-fp4/1k/1k/disagg/stp}/mid_curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/8k1k => dsr1/sglang/gb300-fp4/8k/1k/disagg/stp}/low_latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/8k1k => dsr1/sglang/gb300-fp4/8k/1k/disagg/stp}/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp4/8k1k => dsr1/sglang/gb300-fp4/8k/1k/disagg/stp}/mid_curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/1k1k => dsr1/sglang/gb300-fp8/1k/1k/disagg}/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/1k1k => dsr1/sglang/gb300-fp8/1k/1k/disagg}/stp/max.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/1k1k => dsr1/sglang/gb300-fp8/1k/1k/disagg}/stp/mid.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/8k1k => dsr1/sglang/gb300-fp8/8k/1k/disagg}/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/8k1k => dsr1/sglang/gb300-fp8/8k/1k/disagg}/stp/max.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{gb300-fp8/8k1k => dsr1/sglang/gb300-fp8/8k/1k/disagg}/stp/mid.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/1k1k => dsr1/sglang/h100-fp8/1k/1k/disagg}/mtp/h100-fp8-1p1d-max-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/1k1k => dsr1/sglang/h100-fp8/1k/1k/disagg}/mtp/h100-fp8-1p2d-max-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/1k1k => dsr1/sglang/h100-fp8/1k/1k/disagg}/stp/h100-fp8-1p1d-max-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/1k1k => dsr1/sglang/h100-fp8/1k/1k/disagg}/stp/h100-fp8-1p2d-max-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/8k1k => dsr1/sglang/h100-fp8/8k/1k/disagg}/mtp/h100-fp8-1p1d-max-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/8k1k => dsr1/sglang/h100-fp8/8k/1k/disagg}/mtp/h100-fp8-1p1d-max-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/8k1k => dsr1/sglang/h100-fp8/8k/1k/disagg}/stp/h100-fp8-1p1d-max-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h100/8k1k => dsr1/sglang/h100-fp8/8k/1k/disagg}/stp/h100-fp8-1p1d-max-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/mtp}/bs256-1p6d-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/mtp}/bs256-1p6d-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/mtp}/low-latency-1p9d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/stp}/bs256-1p6d-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/stp}/bs256-1p6d-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/1k1k => dsr1/sglang/h200-fp8/1k/1k/disagg/stp}/low-latency-1p9d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs128-1p1d-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs16-1p3d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs4-1p7d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs64-2p3d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/mtp}/bs8-1p6d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs128-1p1d-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs16-1p3d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs4-1p7d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs64-2p3d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{h200/8k1k => 
dsr1/sglang/h200-fp8/8k/1k/disagg/stp}/bs8-1p6d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/1k1k => dsr1/trtllm/b200-fp4/1k/1k/disagg}/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp4/8k1k => dsr1/trtllm/b200-fp4/8k/1k/disagg}/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/1k1k => dsr1/trtllm/b200-fp8/1k/1k/disagg}/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b200-fp8/8k1k => dsr1/trtllm/b200-fp8/8k/1k/disagg}/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => 
dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/1k1k => dsr1/trtllm/b300-fp4/1k/1k/disagg}/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp4/8k1k => dsr1/trtllm/b300-fp4/8k/1k/disagg}/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => 
dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/1k1k => dsr1/trtllm/b300-fp8/1k/1k/disagg}/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/b300-fp8/8k1k => dsr1/trtllm/b300-fp8/8k/1k/disagg}/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => 
dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/1k1k => dsr1/trtllm/gb200-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp4/8k1k => dsr1/trtllm/gb200-fp4/8k/1k/disagg}/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => 
dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/1k1k => dsr1/trtllm/gb200-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb200-fp8/8k1k => dsr1/trtllm/gb200-fp8/8k/1k/disagg}/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/1k1k => dsr1/trtllm/gb300-fp4/1k/1k/disagg}/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp4/8k1k => dsr1/trtllm/gb300-fp4/8k/1k/disagg}/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/1k1k => dsr1/trtllm/gb300-fp8/1k/1k/disagg}/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/gb300-fp8/8k1k => dsr1/trtllm/gb300-fp8/8k/1k/disagg}/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/1k1k => dsr1/trtllm/h100-fp8/1k/1k/disagg}/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h100-fp8/8k1k => dsr1/trtllm/h100-fp8/8k/1k/disagg}/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => 
dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/1k1k => dsr1/trtllm/h200-fp8/1k/1k/disagg}/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml 
(100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{trtllm/h200/8k1k => dsr1/trtllm/h200-fp8/8k/1k/disagg}/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/1k1k => dsv4/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-1p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/1k1k => dsv4/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-1p1d-dep8-tep8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/1k1k => dsv4/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-3p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/8k1k => dsv4/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-1p1d-dep8-tep8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/8k1k => dsv4/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-3p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/deepseek-v4/8k1k => dsv4/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-7p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => 
kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp}/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP => kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp}/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/1k1k => kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-1p1d-dep4-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/1k1k => kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp}/disagg-gb200-1p4d-dep4-tep4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/8k1k => kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-1p4d-dep4-tep4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/8k1k => kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-3p1d-dep4-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/8k1k => kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-5p1d-dep4-dep8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/{vllm/kimi-k2.5/8k1k => kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp}/disagg-gb200-6p1d-dep4-dep16.yaml (100%) diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md index f383f20ba..482c9acfc 100644 --- a/.github/configs/CONFIGS.md +++ b/.github/configs/CONFIGS.md @@ -69,9 +69,9 @@ search-space: ``` - `recipe` is a path **relative to 
-- The path may carry an `:override[N]` / `:override_<name>` suffix to select a named override section inside an sglang-style recipe yaml (e.g. `"b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
+- The path may carry an `:override[N]` / `:override_<name>` suffix to select a named override section inside an sglang-style recipe yaml (e.g. `"dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
 - `recipe` is optional: multi-node entries that do *not* go through srt-slurm (e.g. dynamo-sglang aggregated topologies that drive their own bash) leave it unset.
-- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` mirroring the upstream NVIDIA/srt-slurm `recipes/` layout (e.g. `trtllm/b200-fp4/...`, `vllm/deepseek-v4/...`, `gb200-fp4/...`). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
+- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` organized as `<model>/<framework>/<gpu>-<precision>/<isl>/<osl>/<serving-mode>/<spec-mode>/<name>.yaml` — e.g. `dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml`. A handful of sglang-style files that carry override sections spanning both stp and mtp are parked one level shallower (the trailing `<spec-mode>/` segment is omitted). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
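For orientation, a minimal sketch of a master-yaml search-space entry once `recipe:` is first-class. The recipe paths are real ones from this patch, but the entry itself is illustrative: the field set is abridged to what the nvidia-master.yaml hunks below show, the nesting is assumed, and the second entry's `conc-list` value is hypothetical.

```yaml
# Sketch only: abridged fields, assumed indentation.
search-space:
  # Plain recipe path, relative to benchmarks/multi_node/srt-slurm-recipes/.
  # The schema validator checks the file exists on disk at sweep generation;
  # the benchmark template later resolves it to an absolute path and passes
  # it to the launcher as CONFIG_FILE.
  - spec-decoding: "mtp"
    conc-list: [1214]
    recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
    prefill:
      num-worker: 1
      tp: 4
  # sglang-style recipe selecting a named override section. The launcher
  # strips the override suffix before reading the file but forwards the
  # full string to srtctl. (conc-list here is hypothetical.)
  - spec-decoding: "mtp"
    conc-list: [8]
    recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"
```

Entries that do not go through srt-slurm simply omit `recipe:` and keep their existing launch plumbing.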
## Runners diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a03b1c0f..bb59f1dd0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -13,7 +13,7 @@ dsr1-fp4-b200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [1214] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 4 @@ -26,7 +26,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [875] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -39,7 +39,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [6] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -52,7 +52,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [10, 15, 25, 45, 90, 180] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -65,7 +65,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 4968 ] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -78,7 +78,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [10860] - recipe: "trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -92,7 +92,7 @@ dsr1-fp4-b200-dynamo-trt: # Non-MTP configurations - conc-list: [4096] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -104,7 +104,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2192] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -116,7 +116,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1365] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -128,7 +128,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [6] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -140,7 +140,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [10, 15, 25, 45, 90, 180] - recipe: "trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -152,7 +152,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [450] - recipe: 
"trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -169,7 +169,7 @@ dsr1-fp4-b200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [90] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -182,7 +182,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [66] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -195,7 +195,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [6] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -208,7 +208,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [10, 15, 30, 60] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -221,7 +221,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [548] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 4 @@ -234,7 +234,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1096, 1691] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" prefill: num-worker: 5 tp: 4 @@ -247,7 +247,7 @@ dsr1-fp4-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [658] - recipe: "trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 5 tp: 4 @@ -261,7 +261,7 @@ dsr1-fp4-b200-dynamo-trt: # Non-MTP configurations - conc-list: [6] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -273,7 +273,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [10, 15, 25, 50, 100] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -285,7 +285,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [370] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -297,7 +297,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [1606] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" prefill: num-worker: 4 tp: 4 @@ -309,7 +309,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [837] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" + 
recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 4 tp: 4 @@ -321,7 +321,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2222] - recipe: "trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -349,7 +349,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - Low latency (TP attention) - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -362,7 +362,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" prefill: num-worker: 1 tp: 8 @@ -375,7 +375,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" prefill: num-worker: 1 tp: 8 @@ -388,7 +388,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [256] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" prefill: num-worker: 1 tp: 8 @@ -402,7 +402,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" conc-list: [896] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" prefill: num-worker: 1 tp: 8 @@ -415,7 +415,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1024] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" prefill: num-worker: 1 tp: 8 @@ -428,7 +428,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1184] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" prefill: num-worker: 1 tp: 8 @@ -441,7 +441,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1600] - recipe: "trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" prefill: num-worker: 1 tp: 8 @@ -455,7 +455,7 @@ dsr1-fp8-b200-dynamo-trt: # Non-MTP (STP) configurations - Low latency (TP attention) - conc-list: [4] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 8 @@ -467,7 +467,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [32] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" prefill: num-worker: 1 tp: 8 @@ -479,7 +479,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: 
false - conc-list: [128] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -492,7 +492,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [1920] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" prefill: num-worker: 1 tp: 8 @@ -504,7 +504,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4096] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" prefill: num-worker: 1 tp: 8 @@ -516,7 +516,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [5152] - recipe: "trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" prefill: num-worker: 2 tp: 8 @@ -534,7 +534,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - Low latency (TP attention) - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -547,7 +547,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -560,7 +560,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [48] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" prefill: num-worker: 1 tp: 8 @@ -573,7 +573,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" prefill: num-worker: 1 tp: 8 @@ -587,7 +587,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" conc-list: [224] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" prefill: num-worker: 2 tp: 8 @@ -600,7 +600,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [288] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" prefill: num-worker: 2 tp: 8 @@ -613,7 +613,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1088] - recipe: "trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" prefill: num-worker: 4 tp: 8 @@ -627,7 +627,7 @@ dsr1-fp8-b200-dynamo-trt: # Non-MTP (STP) configurations - Low latency (TP attention) - conc-list: [1] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" + recipe: 
"dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" prefill: num-worker: 1 tp: 8 @@ -639,7 +639,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [32] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" prefill: num-worker: 1 tp: 8 @@ -651,7 +651,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -663,7 +663,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [96] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" prefill: num-worker: 1 tp: 8 @@ -676,7 +676,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [128] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -688,7 +688,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [128] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -700,7 +700,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" prefill: num-worker: 1 tp: 8 @@ -712,7 +712,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [640] - recipe: "trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" prefill: num-worker: 2 tp: 8 @@ -740,7 +740,7 @@ dsr1-fp4-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [654] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -753,7 +753,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [271] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -766,7 +766,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [11] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -779,7 +779,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [10, 20, 25, 60, 120, 200] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -792,7 +792,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [2342] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml" + 
recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 2 tp: 2 @@ -805,7 +805,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [8609] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml" prefill: num-worker: 5 tp: 2 @@ -818,7 +818,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [12926] - recipe: "trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml" prefill: num-worker: 5 tp: 2 @@ -832,7 +832,7 @@ dsr1-fp4-b300-dynamo-trt: # Non-MTP configurations - conc-list: [1176] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -844,7 +844,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [6] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -856,7 +856,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [5, 10, 15, 25] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -868,7 +868,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [60, 110, 195, 395] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -880,7 +880,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4405] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -892,7 +892,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [8192] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -904,7 +904,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4611] - recipe: "trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -921,7 +921,7 @@ dsr1-fp4-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [2198] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 @@ -934,7 +934,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [52] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -947,7 +947,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: 
"dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -960,7 +960,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -973,7 +973,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [181] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 2 @@ -986,7 +986,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1197] - recipe: "trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" prefill: num-worker: 9 tp: 2 @@ -1000,7 +1000,7 @@ dsr1-fp4-b300-dynamo-trt: # Non-MTP configurations - conc-list: [105] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1012,7 +1012,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [63] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1024,7 +1024,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1036,7 +1036,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1048,7 +1048,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [589] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 2 @@ -1060,7 +1060,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1093] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 6 tp: 2 @@ -1072,7 +1072,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2048] - recipe: "trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 8 tp: 2 @@ -1100,7 +1100,7 @@ dsr1-fp8-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [10] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" prefill: num-worker: 1 tp: 4 @@ -1113,7 +1113,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [160] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" prefill: 
num-worker: 1 tp: 4 @@ -1126,7 +1126,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [3072] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" prefill: num-worker: 1 tp: 4 @@ -1139,7 +1139,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2560] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" prefill: num-worker: 1 tp: 4 @@ -1152,7 +1152,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [720] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" prefill: num-worker: 1 tp: 4 @@ -1165,7 +1165,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [11264] - recipe: "trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" prefill: num-worker: 3 tp: 4 @@ -1181,7 +1181,7 @@ dsr1-fp8-b300-dynamo-trt: osl: 1024 search-space: - conc-list: [2112] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" prefill: num-worker: 1 tp: 4 @@ -1193,7 +1193,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [3072] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" prefill: num-worker: 1 tp: 4 @@ -1205,7 +1205,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [1280] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" prefill: num-worker: 1 tp: 4 @@ -1217,7 +1217,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [12] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" prefill: num-worker: 1 tp: 4 @@ -1229,7 +1229,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 4 @@ -1241,7 +1241,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [384] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" prefill: num-worker: 1 tp: 4 @@ -1253,7 +1253,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [16384] - recipe: "trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" prefill: num-worker: 2 tp: 4 @@ -1270,7 +1270,7 @@ dsr1-fp8-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [40] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" + recipe: 
"dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" prefill: num-worker: 1 tp: 4 @@ -1283,7 +1283,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 @@ -1296,7 +1296,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [20] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" prefill: num-worker: 1 tp: 4 @@ -1309,7 +1309,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [72] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" prefill: num-worker: 1 tp: 4 @@ -1322,7 +1322,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [144] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" prefill: num-worker: 2 tp: 4 @@ -1335,7 +1335,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" prefill: num-worker: 4 tp: 4 @@ -1351,7 +1351,7 @@ dsr1-fp8-b300-dynamo-trt: osl: 1024 search-space: - conc-list: [64] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" prefill: num-worker: 1 tp: 4 @@ -1363,7 +1363,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [16] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" prefill: num-worker: 1 tp: 4 @@ -1375,7 +1375,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [256] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml" prefill: num-worker: 2 tp: 4 @@ -1387,7 +1387,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [512] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml" prefill: num-worker: 3 tp: 4 @@ -1399,7 +1399,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [256] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml" prefill: num-worker: 3 tp: 4 @@ -1411,7 +1411,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [1075] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml" prefill: num-worker: 5 tp: 4 @@ -1423,7 +1423,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [3072] - recipe: "trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml" + recipe: 
"dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml" prefill: num-worker: 7 tp: 4 @@ -2440,7 +2440,7 @@ dsr1-fp8-h200-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [1] - recipe: "trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2453,7 +2453,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [4] - recipe: "trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2466,7 +2466,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2479,7 +2479,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [16] - recipe: "trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2492,7 +2492,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2505,7 +2505,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2518,7 +2518,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [128] - recipe: "trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2531,7 +2531,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [256] - recipe: "trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2544,7 +2544,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 8 @@ -2557,7 +2557,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [1] - recipe: "trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2569,7 +2569,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2581,7 +2581,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8] - recipe: 
"trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2593,7 +2593,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [16] - recipe: "trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2605,7 +2605,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [32] - recipe: "trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2617,7 +2617,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [64] - recipe: "trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2629,7 +2629,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [128] - recipe: "trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2641,7 +2641,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2653,7 +2653,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [512] - recipe: "trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2670,7 +2670,7 @@ dsr1-fp8-h200-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [1] - recipe: "trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2683,7 +2683,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [4] - recipe: "trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2696,7 +2696,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2709,7 +2709,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [16] - recipe: "trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 8 @@ -2722,7 +2722,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 8 @@ -2735,7 +2735,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: 
"trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 8 @@ -2748,7 +2748,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [128] - recipe: "trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml" prefill: num-worker: 2 tp: 8 @@ -2761,7 +2761,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [256] - recipe: "trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml" prefill: num-worker: 3 tp: 8 @@ -2774,7 +2774,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 8 @@ -2787,7 +2787,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [1] - recipe: "trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2799,7 +2799,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2811,7 +2811,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8] - recipe: "trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2823,7 +2823,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [16] - recipe: "trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2835,7 +2835,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [32] - recipe: "trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2847,7 +2847,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [64] - recipe: "trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2859,7 +2859,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [128] - recipe: "trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2871,7 +2871,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 8 @@ -2883,7 +2883,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [512] - recipe: 
"trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 8 @@ -2911,7 +2911,7 @@ dsr1-fp8-h100-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [6] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2924,7 +2924,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2937,7 +2937,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [30] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2950,7 +2950,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [60] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2963,7 +2963,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [117] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2976,7 +2976,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [231] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2989,7 +2989,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [462] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3002,7 +3002,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [615] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 16 @@ -3015,7 +3015,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 16 @@ -3028,7 +3028,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [6] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3040,7 +3040,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [9] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3052,7 +3052,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: 
[30] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3064,7 +3064,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [60] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3076,7 +3076,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [231] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3088,7 +3088,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [462] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3100,7 +3100,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [924] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3112,7 +3112,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1845] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3124,7 +3124,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [4916] - recipe: "trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 16 @@ -3141,7 +3141,7 @@ dsr1-fp8-h100-dynamo-trt: # MTP configurations (6 points) - spec-decoding: "mtp" conc-list: [6] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3154,7 +3154,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3167,7 +3167,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [30] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3180,7 +3180,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [77] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3195,7 +3195,7 @@ dsr1-fp8-h100-dynamo-trt: # https://github.com/InferenceMAX/InferenceMAX/actions/runs/21769314582/job/62813105509 # - spec-decoding: "mtp" # conc-list: [78] - # recipe: "trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" + # recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3208,7 +3208,7 @@ dsr1-fp8-h100-dynamo-trt: # 
dp-attn: false - spec-decoding: "mtp" conc-list: [154] - recipe: "trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 2 tp: 16 @@ -3221,7 +3221,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true # STP configurations (5 points) - conc-list: [6] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3233,7 +3233,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [9] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3245,7 +3245,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [30] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3257,7 +3257,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [154] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3269,7 +3269,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [308] - recipe: "trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 16 @@ -3494,7 +3494,7 @@ dsr1-fp8-h100-dynamo-sglang: search-space: # # STP: Max throughput TEP (1 prefill, 2 decode) # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # recipe: "h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml" + # recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3507,7 +3507,7 @@ dsr1-fp8-h100-dynamo-sglang: # dp-attn: false # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # recipe: "h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml" + # recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3521,7 +3521,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput TEP (1 prefill, 2 decode) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - recipe: "h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3535,7 +3535,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64] - recipe: "h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3551,7 +3551,7 @@ dsr1-fp8-h100-dynamo-sglang: search-space: # # STP: Max throughput TEP (1 prefill, 1 decode) # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # recipe: "h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml" + # recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3564,7 +3564,7 @@ dsr1-fp8-h100-dynamo-sglang: # dp-attn: false # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # recipe: 
"h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml" + # recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3578,7 +3578,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput TEP (1 prefill, 1 decode) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - recipe: "h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3592,7 +3592,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64] - recipe: "h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3687,7 +3687,7 @@ dsr1-fp4-gb200-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [ 180 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3700,7 +3700,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 4, 8, 12, 24, 48 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3713,7 +3713,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 4301 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" prefill: num-worker: 2 tp: 4 @@ -3726,7 +3726,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 2253 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -3739,7 +3739,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 16130 ] - recipe: "trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -3754,7 +3754,7 @@ dsr1-fp4-gb200-dynamo-trt: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4301 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3766,7 +3766,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 666 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3778,7 +3778,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 6144 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3790,7 +3790,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true - conc-list: [ 12, 24, 48, 96, 192 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + recipe: 
"dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3802,7 +3802,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3814,7 +3814,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 4301 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3826,7 +3826,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 2253 ] - recipe: "trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3844,7 +3844,7 @@ dsr1-fp4-gb200-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [ 4, 8, 12, 24, 48 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3857,7 +3857,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 180 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 4 @@ -3870,7 +3870,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 1229 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" prefill: num-worker: 7 tp: 4 @@ -3883,7 +3883,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 666 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 4 @@ -3896,7 +3896,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 4301 ] - recipe: "trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" prefill: num-worker: 11 tp: 4 @@ -3910,7 +3910,7 @@ dsr1-fp4-gb200-dynamo-trt: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 12, 44, 76 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3922,7 +3922,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3934,7 +3934,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 333 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3946,7 +3946,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 1229 ] - recipe: 
"trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -3958,7 +3958,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 2253 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 8 tp: 4 @@ -3970,7 +3970,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 4096 ] - recipe: "trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 10 tp: 4 @@ -3999,7 +3999,7 @@ dsr1-fp8-gb200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [4301] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" prefill: num-worker: 1 tp: 8 @@ -4012,7 +4012,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2151] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" prefill: num-worker: 1 tp: 8 @@ -4025,7 +4025,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" prefill: num-worker: 1 tp: 8 @@ -4038,7 +4038,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [615] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" prefill: num-worker: 1 tp: 8 @@ -4051,7 +4051,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [36] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" prefill: num-worker: 1 tp: 8 @@ -4064,7 +4064,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [18] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" prefill: num-worker: 1 tp: 8 @@ -4077,7 +4077,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" prefill: num-worker: 1 tp: 8 @@ -4090,7 +4090,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 1k1k STP configs - conc-list: [6144] - recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" prefill: num-worker: 1 tp: 8 @@ -4102,7 +4102,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4301] - recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" prefill: num-worker: 1 tp: 8 @@ -4114,7 
           ep: 8
           dp-attn: true
       - conc-list: [2151]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4126,7 +4126,7 @@ dsr1-fp8-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [1127]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4138,7 +4138,7 @@ dsr1-fp8-gb200-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [256]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4150,7 +4150,7 @@ dsr1-fp8-gb200-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [27]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4162,7 +4162,7 @@ dsr1-fp8-gb200-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [3]
-        recipe: "trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4179,7 +4179,7 @@ dsr1-fp8-gb200-dynamo-trt:
     search-space:
       - spec-decoding: "mtp"
        conc-list: [666]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml"
         prefill:
           num-worker: 3
           tp: 8
@@ -4192,7 +4192,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [666]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
         prefill:
           num-worker: 5
           tp: 8
@@ -4205,7 +4205,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [333]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml"
         prefill:
           num-worker: 3
           tp: 8
@@ -4218,7 +4218,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [333]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml"
         prefill:
           num-worker: 4
           tp: 8
@@ -4231,7 +4231,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [90]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -4244,7 +4244,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [15]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml"
+        recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -4257,7 +4257,7 @@ dsr1-fp8-gb200-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [6]
-        recipe: "trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml"
"dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" prefill: num-worker: 1 tp: 8 @@ -4270,7 +4270,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 8k1k STP configs - conc-list: [1229] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 5 tp: 8 @@ -4282,7 +4282,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [666] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 8 @@ -4294,7 +4294,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [615] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" prefill: num-worker: 3 tp: 8 @@ -4306,7 +4306,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [333] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" prefill: num-worker: 2 tp: 8 @@ -4318,7 +4318,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [63] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" prefill: num-worker: 1 tp: 8 @@ -4330,7 +4330,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [18] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" prefill: num-worker: 1 tp: 8 @@ -4342,7 +4342,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [6] - recipe: "trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" prefill: num-worker: 1 tp: 8 @@ -4370,7 +4370,7 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] - recipe: "gb200-fp8/1k1k/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4384,7 +4384,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) - conc-list: [1024, 2048, 4096] - recipe: "gb200-fp8/1k1k/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 3 tp: 8 @@ -4398,7 +4398,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] - recipe: "gb200-fp8/1k1k/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 2 tp: 8 @@ -4412,7 +4412,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Ultra throughput" (1 prefill workers at DEP8 and 1 decode worker at DEP8) - conc-list: [4096] - recipe: "gb200-fp8/1k1k/ultra-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml" prefill: num-worker: 1 tp: 8 @@ -4429,7 +4429,7 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) - conc-list: [4, 8, 16] - recipe: "gb200-fp8/8k1k/low-latency.yaml" + recipe: 
"dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 8 @@ -4443,7 +4443,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [512, 1024, 2048, 6144] - recipe: "gb200-fp8/8k1k/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 5 tp: 8 @@ -4457,7 +4457,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096, 6144] - recipe: "gb200-fp8/8k1k/max_tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml" prefill: num-worker: 6 tp: 8 @@ -4484,7 +4484,7 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) - conc-list: [4, 8, 16, 32] - recipe: "gb300-fp8/1k1k/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4498,7 +4498,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] - recipe: "gb300-fp8/1k1k/stp/mid.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml" prefill: num-worker: 2 tp: 8 @@ -4512,7 +4512,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) - conc-list: [4096, 7168, 7680] - recipe: "gb300-fp8/1k1k/stp/max.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml" prefill: num-worker: 1 tp: 8 @@ -4529,7 +4529,7 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] - recipe: "gb300-fp8/8k1k/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4543,7 +4543,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [128, 256, 512, 1024] - recipe: "gb300-fp8/8k1k/stp/mid.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml" prefill: num-worker: 5 tp: 8 @@ -4557,7 +4557,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096] - recipe: "gb300-fp8/8k1k/stp/max.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml" prefill: num-worker: 6 tp: 8 @@ -4586,7 +4586,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] - recipe: "gb200-fp4/1k1k/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4601,7 +4601,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] - recipe: "gb200-fp4/1k1k/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 4 tp: 4 @@ -4616,7 +4616,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 2048, 4096 ] - recipe: "gb200-fp4/1k1k/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 4 tp: 4 @@ -4635,7 +4635,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8 ] - recipe: "gb200-fp4/8k1k/low-latency.yaml" + recipe: 
"dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4650,7 +4650,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] - recipe: "gb200-fp4/8k1k/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 6 tp: 4 @@ -4665,7 +4665,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] - recipe: "gb200-fp4/8k1k/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 10 tp: 4 @@ -4693,7 +4693,7 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [3226] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 2 @@ -4706,7 +4706,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" prefill: num-worker: 1 tp: 2 @@ -4719,7 +4719,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [5] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4732,7 +4732,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 12, 24, 48] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4745,7 +4745,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [2253] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" prefill: num-worker: 3 tp: 2 @@ -4758,7 +4758,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" prefill: num-worker: 3 tp: 2 @@ -4771,7 +4771,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [5] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4783,7 +4783,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12, 48, 96, 192] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4795,7 +4795,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8192] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -4807,7 +4807,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1229] - recipe: 
"trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -4819,7 +4819,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [4301] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -4831,7 +4831,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [2253] - recipe: "trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -4848,7 +4848,7 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [33] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4861,7 +4861,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [5] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4874,7 +4874,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [12, 24] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4887,7 +4887,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 4 tp: 2 @@ -4900,7 +4900,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [308] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 2 @@ -4913,7 +4913,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 @@ -4926,7 +4926,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" prefill: num-worker: 10 tp: 2 @@ -4939,7 +4939,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1127] - recipe: "trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" prefill: num-worker: 13 tp: 2 @@ -4952,7 +4952,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [72] - recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4964,7 
           ep: 8
           dp-attn: false
       - conc-list: [5]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 2
@@ -4976,7 +4976,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [12]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 2
@@ -4988,7 +4988,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [5, 15, 30]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 2
@@ -5000,7 +5000,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 4
           dp-attn: false
       - conc-list: [666]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 7
           tp: 2
@@ -5012,7 +5012,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [1229]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml"
         prefill:
           num-worker: 9
           tp: 2
@@ -5024,7 +5024,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [3228]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 11
           tp: 2
@@ -5036,7 +5036,7 @@ dsr1-fp4-gb300-dynamo-trt:
           ep: 4
           dp-attn: true
       - conc-list: [2253]
-        recipe: "trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml"
+        recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 14
           tp: 2
@@ -5065,7 +5065,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Low latency (1 prefill node, 2 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 4, 8, 32 ]
-        recipe: "gb300-fp4/1k1k/low_latency.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5080,7 +5080,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Mid curve (4 prefill nodes, 8 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 512, 2048, 4096, 8192 ]
-        recipe: "gb300-fp4/1k1k/mid_curve.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml"
         prefill:
           num-worker: 4
           tp: 4
@@ -5095,7 +5095,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Max throughput (4 prefill nodes, 12 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 512, 2048, 4096, 8192 ]
-        recipe: "gb300-fp4/1k1k/max_tpt.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml"
         prefill:
           num-worker: 4
           tp: 4
@@ -5114,7 +5114,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Low latency (1 prefill node, 4 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 4, 8, 32, 64 ]
-        recipe: "gb300-fp4/8k1k/low_latency.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5129,7 +5129,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Mid curve (6 prefill nodes, 12 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 512, 2048, 4096 ]
-        recipe: "gb300-fp4/8k1k/mid_curve.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml"
         prefill:
           num-worker: 6
           tp: 4
@@ -5144,7 +5144,7 @@ dsr1-fp4-gb300-dynamo-sglang:
       # Max throughput (10 prefill nodes, 8 decode nodes)
       - spec-decoding: "none"
         conc-list: [ 2048 ]
-        recipe: "gb300-fp4/8k1k/max_tpt.yaml"
+        recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml"
         prefill:
           num-worker: 10
           tp: 4
@@ -5172,7 +5172,7 @@ dsr1-fp8-gb300-dynamo-trt:
       # MTP configurations (spec_decoding="mtp")
       - spec-decoding: "mtp"
         conc-list: [8]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5185,7 +5185,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [24]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5198,7 +5198,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [180]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5211,7 +5211,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [564]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -5224,7 +5224,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [666]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5237,7 +5237,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [2253]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -5250,7 +5250,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [8192]
-        recipe: "trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml"
         prefill:
           num-worker: 3
           tp: 4
@@ -5263,7 +5263,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       # STP configurations (no spec_decoding)
       - conc-list: [4]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5275,7 +5275,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [24]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5287,7 +5287,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [84]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5299,7 +5299,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [1229]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -5311,7 +5311,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [2253]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -5323,7 +5323,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [8602]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml"
         prefill:
           num-worker: 3
           tp: 4
@@ -5335,7 +5335,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: true
       - conc-list: [12288]
-        recipe: "trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml"
         prefill:
           num-worker: 3
           tp: 4
@@ -5352,7 +5352,7 @@ dsr1-fp8-gb300-dynamo-trt:
       # MTP configurations (spec_decoding="mtp")
       - spec-decoding: "mtp"
         conc-list: [8]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5365,7 +5365,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [24]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5378,7 +5378,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [333]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml"
         prefill:
           num-worker: 6
           tp: 4
@@ -5391,7 +5391,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [666]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml"
         prefill:
           num-worker: 8
           tp: 4
@@ -5404,7 +5404,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1229]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml"
         prefill:
           num-worker: 10
           tp: 4
@@ -5417,7 +5417,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1229]
-        recipe: "trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml"
         prefill:
           num-worker: 7
           tp: 4
@@ -5430,7 +5430,7 @@ dsr1-fp8-gb300-dynamo-trt:
           dp-attn: true
       # STP configurations (no spec_decoding)
       - conc-list: [4]
-        recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5442,7 +5442,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [24]
-        recipe: "trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml"
+        recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -5454,7 +5454,7 @@ dsr1-fp8-gb300-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [36]
"trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" prefill: num-worker: 1 tp: 4 @@ -5466,7 +5466,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [512] - recipe: "trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" prefill: num-worker: 6 tp: 4 @@ -5478,7 +5478,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [666] - recipe: "trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 4 @@ -5490,7 +5490,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1229] - recipe: "trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 7 tp: 4 @@ -5502,7 +5502,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [2151] - recipe: "trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" prefill: num-worker: 7 tp: 4 @@ -5800,7 +5800,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "none" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - recipe: "h200/1k1k/low-latency-1p9d.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml" prefill: num-worker: 1 tp: 8 @@ -5814,7 +5814,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [512, 1024, 2048] - recipe: "h200/1k1k/bs256-1p6d-tp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml" prefill: num-worker: 1 tp: 8 @@ -5828,7 +5828,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "none" conc-list: [128, 256, 512, 1024, 2048] - recipe: "h200/1k1k/bs256-1p6d-dep.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml" prefill: num-worker: 1 tp: 8 @@ -5842,7 +5842,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "mtp" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - recipe: "h200/1k1k/low-latency-1p9d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5856,7 +5856,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [512, 1024, 2048] - recipe: "h200/1k1k/bs256-1p6d-tp-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5870,7 +5870,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "mtp" conc-list: [128, 256, 512, 1024, 2048] - recipe: "h200/1k1k/bs256-1p6d-dep-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5887,7 +5887,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "none" conc-list: [1, 4, 8] - recipe: "h200/8k1k/bs4-1p7d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml" prefill: num-worker: 1 tp: 8 @@ -5901,7 +5901,7 @@ 
       # STP: TEP (1 prefill, 6 decode)
       - spec-decoding: "none"
         conc-list: [4, 8, 16]
-        recipe: "h200/8k1k/bs8-1p6d.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5915,7 +5915,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # STP: TEP (1 prefill, 3 decode)
       - spec-decoding: "none"
         conc-list: [8, 16, 32]
-        recipe: "h200/8k1k/bs16-1p3d.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5929,7 +5929,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # STP: TEP (2 prefill, 3 decode)
       - spec-decoding: "none"
         conc-list: [32, 64, 128]
-        recipe: "h200/8k1k/bs64-2p3d.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -5943,7 +5943,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # STP: High throughput DEP (1 prefill, 1 decode, dp-attention)
       - spec-decoding: "none"
         conc-list: [64, 128, 256]
-        recipe: "h200/8k1k/bs128-1p1d-dep.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5957,7 +5957,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: Low latency TEP (1 prefill, 7 decode)
       - spec-decoding: "mtp"
         conc-list: [1, 4, 8]
-        recipe: "h200/8k1k/bs4-1p7d-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5971,7 +5971,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: TEP (1 prefill, 6 decode)
       - spec-decoding: "mtp"
         conc-list: [2, 4, 8, 16, 32]
-        recipe: "h200/8k1k/bs8-1p6d-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5985,7 +5985,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: TEP (1 prefill, 3 decode)
       - spec-decoding: "mtp"
         conc-list: [4, 8, 16, 32, 64]
-        recipe: "h200/8k1k/bs16-1p3d-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -5999,7 +5999,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: TEP (2 prefill, 3 decode)
       - spec-decoding: "mtp"
         conc-list: [32, 64, 128]
-        recipe: "h200/8k1k/bs64-2p3d-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -6013,7 +6013,7 @@ dsr1-fp8-h200-dynamo-sglang:
       # MTP: High throughput DEP (1 prefill, 1 decode, dp-attention)
       - spec-decoding: "mtp"
         conc-list: [32, 64, 128, 256, 512]
-        recipe: "h200/8k1k/bs128-1p1d-dep-mtp.yaml"
+        recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6040,7 +6040,7 @@ dsr1-fp4-b200-dynamo-sglang:
     search-space:
       # Non-MTP configurations
       - conc-list: [16, 128]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6052,7 +6052,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [32, 64, 256]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6064,7 +6064,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6076,7 +6076,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6092,7 +6092,7 @@ dsr1-fp4-b200-dynamo-sglang:
     search-space:
       # Non-MTP configurations
       - conc-list: [64, 128]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6104,7 +6104,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [8]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6116,7 +6116,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [4, 128]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[2]"
         prefill:
           num-worker: 2
           tp: 4
@@ -6128,7 +6128,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [4, 8, 16, 64]
-        recipe: "b200-fp4/8k1k.yaml:override_stp_tp4"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_stp_tp4"
         prefill:
           num-worker: 1
           tp: 4
@@ -6140,7 +6140,7 @@ dsr1-fp4-b200-dynamo-sglang:
           ep: 1
           dp-attn: false
       - conc-list: [1024, 2048]
-        recipe: "b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_stp_maxtpt_7p2d"
         prefill:
           num-worker: 7
           tp: 4
@@ -6167,7 +6167,7 @@ dsr1-fp8-b200-dynamo-sglang:
     search-space:
       # Non-MTP configurations
       - conc-list: [4]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6179,7 +6179,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [16, 32, 64, 128, 256]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6191,7 +6191,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: false
       - conc-list: [1024, 2048, 4096]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6203,7 +6203,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [2048, 4096]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]"
         prefill:
           num-worker: 2
           tp: 8
@@ -6219,7 +6219,7 @@ dsr1-fp8-b200-dynamo-sglang:
     search-space:
      # STP low-latency: resolved from 8k1k.yaml zip_override_stp_lowlat
       - conc-list: [128]
-        recipe: "b200-fp8/8k1k_stp_lowlat_0.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6231,7 +6231,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 1
           dp-attn: false
       - conc-list: [128]
-        recipe: "b200-fp8/8k1k_stp_lowlat_1.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6243,7 +6243,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 1
           dp-attn: false
       - conc-list: [8, 16, 32, 64, 128]
-        recipe: "b200-fp8/8k1k_stp_lowlat_2.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6256,7 +6256,7 @@ dsr1-fp8-b200-dynamo-sglang:
           dp-attn: false
       # STP max-throughput: resolved from 8k1k.yaml zip_override_stp_maxtpt
       - conc-list: [288]
-        recipe: "b200-fp8/8k1k_stp_maxtpt_0.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6268,7 +6268,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [160, 288]
-        recipe: "b200-fp8/8k1k_stp_maxtpt_1.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6280,7 +6280,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [512]
-        recipe: "b200-fp8/8k1k_stp_maxtpt_2.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -6292,7 +6292,7 @@ dsr1-fp8-b200-dynamo-sglang:
           ep: 8
           dp-attn: true
       - conc-list: [1024]
-        recipe: "b200-fp8/8k1k_stp_maxtpt_3.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml"
         prefill:
           num-worker: 3
           tp: 8
@@ -6320,7 +6320,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP low-latency: 1P1D
       - spec-decoding: "mtp"
         conc-list: [4, 64]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6334,7 +6334,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP low-latency: 1P3D
       - spec-decoding: "mtp"
         conc-list: [4, 8, 16, 32, 128]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6348,7 +6348,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP max-tpt: 1P5D
       - spec-decoding: "mtp"
         conc-list: [512, 4096]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]"
         prefill:
           num-worker: 1
           tp: 8
@@ -6362,7 +6362,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP max-tpt: 2P5D
       - spec-decoding: "mtp"
         conc-list: [1024, 2048, 4096]
-        recipe: "b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[2]"
         prefill:
           num-worker: 2
           tp: 8
@@ -6376,7 +6376,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP max-tpt: 1P2D
       - spec-decoding: "mtp"
         conc-list: [512, 1024, 2048]
-        recipe: "b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d"
+        recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:override_mtp_maxtpt_1p2d"
         prefill:
           num-worker: 1
           tp: 8
@@ -6393,7 +6393,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP low-latency: resolved from 8k1k.yaml zip_override_mtp_lowlat
       - spec-decoding: "mtp"
         conc-list: [128]
-        recipe: "b200-fp8/8k1k_mtp_lowlat_0.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6406,7 +6406,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [128]
-        recipe: "b200-fp8/8k1k_mtp_lowlat_1.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6419,7 +6419,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [8, 16, 32, 64, 128]
-        recipe: "b200-fp8/8k1k_mtp_lowlat_2.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml"
        prefill:
           num-worker: 1
           tp: 8
@@ -6433,7 +6433,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
       # MTP max-throughput: resolved from 8k1k.yaml zip_override_mtp_maxtpt
       - spec-decoding: "mtp"
         conc-list: [288]
-        recipe: "b200-fp8/8k1k_mtp_maxtpt_0.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6446,7 +6446,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [160, 288]
-        recipe: "b200-fp8/8k1k_mtp_maxtpt_1.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml"
         prefill:
           num-worker: 1
           tp: 8
@@ -6459,7 +6459,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [512]
-        recipe: "b200-fp8/8k1k_mtp_maxtpt_2.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml"
         prefill:
           num-worker: 2
           tp: 8
@@ -6472,7 +6472,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [1024]
-        recipe: "b200-fp8/8k1k_mtp_maxtpt_3.yaml"
+        recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml"
         prefill:
           num-worker: 3
           tp: 8
@@ -6499,7 +6499,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
     search-space:
       - spec-decoding: "mtp"
         conc-list: [16, 512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6512,7 +6512,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [32, 64, 256, 512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6525,7 +6525,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [512, 1024]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6538,7 +6538,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: true
       - spec-decoding: "mtp"
         conc-list: [512]
-        recipe: "b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]"
+        recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6557,7 +6557,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
     search-space:
       - spec-decoding: "mtp"
         conc-list: [64, 128]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[0]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6570,7 +6570,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [8]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[1]"
         prefill:
           num-worker: 1
           tp: 4
@@ -6583,7 +6583,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [4, 128]
-        recipe: "b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[2]"
         prefill:
           num-worker: 2
           tp: 4
@@ -6596,7 +6596,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp:
           dp-attn: false
       - spec-decoding: "mtp"
         conc-list: [4, 8, 16, 64]
-        recipe: "b200-fp4/8k1k.yaml:override_mtp_tp4"
+        recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_mtp_tp4"
         prefill:
           num-worker: 1
           tp: 4
@@ -6623,7 +6623,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
     search-space:
       # Non-MTP configurations (default spec_decoding="none")
       - conc-list: [ 4, 192, 360, 668 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6635,7 +6635,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [ 5, 15, 30, 55 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6647,7 +6647,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 4
           dp-attn: false
       - conc-list: [ 666 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6659,7 +6659,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [ 2253 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6671,7 +6671,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 32
           dp-attn: true
       - conc-list: [ 4301, 6452 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6683,7 +6683,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 8
           dp-attn: true
       - conc-list: [ 4301 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -6695,7 +6695,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [ 4301 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -6712,7 +6712,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
     search-space:
       # Non-MTP configurations (default spec_decoding="none")
       - conc-list: [ 4 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6724,7 +6724,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 8
           dp-attn: false
       - conc-list: [ 156 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6736,7 +6736,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 4
           dp-attn: false
       - conc-list: [ 5, 15, 30, 60, 105 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml"
         prefill:
           num-worker: 1
           tp: 4
@@ -6748,7 +6748,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 4
           dp-attn: false
       - conc-list: [ 333 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml"
         prefill:
           num-worker: 2
           tp: 4
@@ -6760,7 +6760,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [ 615 ]
-        recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
+        recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml"
         prefill:
           num-worker: 3
           tp: 4
@@ -6772,7 +6772,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           ep: 16
           dp-attn: true
       - conc-list: [ 2151 ]
- recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 4 @@ -6784,7 +6784,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 2253 ] - recipe: "kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -6810,7 +6810,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [256, 512, 1024, 2048, 3072, 4096] - recipe: "vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml" prefill: num-worker: 1 tp: 4 @@ -6822,7 +6822,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - conc-list: [4, 8, 16, 32, 64, 128] - recipe: "vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 @@ -6837,7 +6837,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [4, 8, 16, 32, 128] - recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 @@ -6849,7 +6849,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 4 dp-attn: false - conc-list: [512, 1024] - recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml" prefill: num-worker: 3 tp: 4 @@ -6861,7 +6861,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - conc-list: [2048] - recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml" prefill: num-worker: 5 tp: 4 @@ -6873,7 +6873,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 8 dp-attn: true - conc-list: [3072, 4096] - recipe: "vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml" prefill: num-worker: 6 tp: 4 @@ -6905,7 +6905,7 @@ dsv4-fp4-gb200-dynamo-vllm: # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). - conc-list: [1, 4, 8, 16, 32, 64] - recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 @@ -6919,7 +6919,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [128, 256, 1024, 2048, 4096] - recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml" prefill: num-worker: 1 tp: 8 @@ -6934,7 +6934,7 @@ dsv4-fp4-gb200-dynamo-vllm: # The 4096 overlap with the 1p1d block gives a crossover point. 8192 # would saturate 1p1d's prefill, so this topology takes over there. 
- conc-list: [4096, 8192] - recipe: "vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 @@ -6952,7 +6952,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - conc-list: [1, 4, 8, 16, 32, 64] - recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 @@ -6965,7 +6965,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: false # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - conc-list: [512, 1024] - recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 @@ -6979,7 +6979,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - conc-list: [4096, 8192] - recipe: "vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml" prefill: num-worker: 7 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp4/1k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp4/8k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/1k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml rename to 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_lowlat_2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/max-tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/1k1k/mid-curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/max-tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp4/8k1k/mid-curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/max-tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/mid-curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/1k1k/ultra-tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/max_tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb200-fp8/8k1k/mid-curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/low_latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/max_tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/1k1k/mid_curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/low_latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/max_tpt.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp4/8k1k/mid_curve.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/max.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/1k1k/stp/mid.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/low-latency.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/max.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/gb300-fp8/8k1k/stp/mid.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-dep.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/bs256-1p6d-tp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/1k1k/low-latency-1p9d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d-mtp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs128-1p1d-dep.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs16-1p3d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs4-1p7d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs64-2p3d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/h200/8k1k/bs8-1p6d.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from
benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml 
similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml similarity index 100% 
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml From aa430b5baf5314f6dcc4829e190fceb299c074b3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 13:57:14 -0500 Subject: [PATCH 05/16] srt-slurm: collapse split // recipe dirs into / Per request, drop the awkward `1k/1k/` two-segment intermediate in the recipe tree in favor of `1k1k/`. New shape: //-////.yaml 370 files renamed, 393 recipe references in nvidia-master.yaml rewritten, schema validation + tests still green. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/CONFIGS.md | 4 +- .github/configs/nvidia-master.yaml | 786 +++++++++--------- .../b200-fp4/{1k/1k => 1k1k}/disagg/1k1k.yaml | 0 .../b200-fp4/{8k/1k => 8k1k}/disagg/8k1k.yaml | 0 .../b200-fp8/{1k/1k => 1k1k}/disagg/1k1k.yaml | 0 .../disagg/mtp/8k1k_mtp_lowlat_0.yaml | 0 .../disagg/mtp/8k1k_mtp_lowlat_1.yaml | 0 .../disagg/mtp/8k1k_mtp_lowlat_2.yaml | 0 .../disagg/mtp/8k1k_mtp_maxtpt_0.yaml | 0 .../disagg/mtp/8k1k_mtp_maxtpt_1.yaml | 0 .../disagg/mtp/8k1k_mtp_maxtpt_2.yaml | 0 .../disagg/mtp/8k1k_mtp_maxtpt_3.yaml | 0 .../disagg/stp/8k1k_stp_lowlat_0.yaml | 0 .../disagg/stp/8k1k_stp_lowlat_1.yaml | 0 .../disagg/stp/8k1k_stp_lowlat_2.yaml | 0 .../disagg/stp/8k1k_stp_maxtpt_0.yaml | 0 .../disagg/stp/8k1k_stp_maxtpt_1.yaml | 0 .../disagg/stp/8k1k_stp_maxtpt_2.yaml | 0 .../disagg/stp/8k1k_stp_maxtpt_3.yaml | 0 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml | 0 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/max-tpt.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml | 0 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/ultra-tpt.yaml | 0 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml | 0 .../1k => 1k1k}/disagg/stp/low_latency.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/max_tpt.yaml | 0 .../{1k/1k => 1k1k}/disagg/stp/mid_curve.yaml | 0 .../1k => 8k1k}/disagg/stp/low_latency.yaml | 0 .../{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml | 0 .../{8k/1k => 
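
Mechanically the collapse is a single path-segment fold; a minimal
sketch of the rename rule (illustrative only; the helper below is
hypothetical, not the migration script actually used):

    import re

    def collapse_isl_osl(path: str) -> str:
        # Fold the split "<isl>/<osl>/" pair (e.g. "1k/1k/") into the
        # single "<isl><osl>/" segment (e.g. "1k1k/").
        return re.sub(r"/(\d+k)/(\d+k)/", r"/\1\2/", path)

    # collapse_isl_osl("dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml")
    #   -> "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml"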

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/configs/CONFIGS.md | 4 +-
 .github/configs/nvidia-master.yaml | 786 +++++++++---------
 .../b200-fp4/{1k/1k => 1k1k}/disagg/1k1k.yaml | 0
 .../b200-fp4/{8k/1k => 8k1k}/disagg/8k1k.yaml | 0
 .../b200-fp8/{1k/1k => 1k1k}/disagg/1k1k.yaml | 0
 .../disagg/mtp/8k1k_mtp_lowlat_0.yaml | 0
 .../disagg/mtp/8k1k_mtp_lowlat_1.yaml | 0
 .../disagg/mtp/8k1k_mtp_lowlat_2.yaml | 0
 .../disagg/mtp/8k1k_mtp_maxtpt_0.yaml | 0
 .../disagg/mtp/8k1k_mtp_maxtpt_1.yaml | 0
 .../disagg/mtp/8k1k_mtp_maxtpt_2.yaml | 0
 .../disagg/mtp/8k1k_mtp_maxtpt_3.yaml | 0
 .../disagg/stp/8k1k_stp_lowlat_0.yaml | 0
 .../disagg/stp/8k1k_stp_lowlat_1.yaml | 0
 .../disagg/stp/8k1k_stp_lowlat_2.yaml | 0
 .../disagg/stp/8k1k_stp_maxtpt_0.yaml | 0
 .../disagg/stp/8k1k_stp_maxtpt_1.yaml | 0
 .../disagg/stp/8k1k_stp_maxtpt_2.yaml | 0
 .../disagg/stp/8k1k_stp_maxtpt_3.yaml | 0
 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml | 0
 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/max-tpt.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml | 0
 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/ultra-tpt.yaml | 0
 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml | 0
 .../1k => 1k1k}/disagg/stp/low_latency.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/max_tpt.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/mid_curve.yaml | 0
 .../1k => 8k1k}/disagg/stp/low_latency.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/mid_curve.yaml | 0
 .../1k => 1k1k}/disagg/stp/low-latency.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/max.yaml | 0
 .../{1k/1k => 1k1k}/disagg/stp/mid.yaml | 0
 .../1k => 8k1k}/disagg/stp/low-latency.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/max.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/mid.yaml | 0
 .../disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 0
 .../disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml | 0
 .../disagg/stp/h100-fp8-1p1d-max-dep.yaml | 0
 .../disagg/stp/h100-fp8-1p2d-max-tp.yaml | 0
 .../disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 0
 .../disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml | 0
 .../disagg/stp/h100-fp8-1p1d-max-dep.yaml | 0
 .../disagg/stp/h100-fp8-1p1d-max-tp.yaml | 0
 .../disagg/mtp/bs256-1p6d-dep-mtp.yaml | 0
 .../disagg/mtp/bs256-1p6d-tp-mtp.yaml | 0
 .../disagg/mtp/low-latency-1p9d-mtp.yaml | 0
 .../disagg/stp/bs256-1p6d-dep.yaml | 0
 .../1k => 1k1k}/disagg/stp/bs256-1p6d-tp.yaml | 0
 .../disagg/stp/low-latency-1p9d.yaml | 0
 .../disagg/mtp/bs128-1p1d-dep-mtp.yaml | 0
 .../1k => 8k1k}/disagg/mtp/bs16-1p3d-mtp.yaml | 0
 .../1k => 8k1k}/disagg/mtp/bs4-1p7d-mtp.yaml | 0
 .../1k => 8k1k}/disagg/mtp/bs64-2p3d-mtp.yaml | 0
 .../1k => 8k1k}/disagg/mtp/bs8-1p6d-mtp.yaml | 0
 .../disagg/stp/bs128-1p1d-dep.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/bs16-1p3d.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/bs4-1p7d.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/bs64-2p3d.yaml | 0
 .../{8k/1k => 8k1k}/disagg/stp/bs8-1p6d.yaml | 0
 .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 0
 .../ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0
 .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 0
 .../ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml | 0
 .../ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml | 0
 .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 0
 .../ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml | 0
 .../ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml | 0
 .../ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml | 0
 .../mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml | 0
 .../ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml | 0
 .../ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml | 0
 .../ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml | 0
 .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml | 0
 .../ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml | 0
 .../ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml | 0
 .../ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml | 0
 .../ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml | 0
 ...x1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml | 0
 ...x1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml | 0
 ...x1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml | 0
 ...tx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml | 0
 .../ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml | 0
 .../ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml | 0
 .../ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml | 0
 ...x1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml | 0
 ...tx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml | 0
 ...ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml | 0
 .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml | 0
 ...tx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml | 0
 ...x2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml | 0
 .../ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml | 0
 .../ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml | 0
 .../ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml | 0
 ...ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml | 0
 .../ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml | 0
 ...x4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml | 0
 ...tx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml | 0
 ...tx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml | 0
 .../ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml | 0
 ...ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml | 0
 .../ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml | 0
 .../ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml | 0
 .../ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml | 0
 ...tx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml | 0
 .../ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml | 0
 .../ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 0
 .../ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 0
 .../ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml | 0
 .../ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml | 0
 .../ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml | 0
 .../ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 0
 .../ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml | 0
 .../ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml | 0
 .../ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml | 0
 .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 0
 .../mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 0
 .../ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml | 0
 .../ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml | 0
 .../ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml | 0
 .../ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml | 0
 .../ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml | 0
 .../ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml | 0
 ...tx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml | 0
 ...x1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml | 0
 ...ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml | 0
 .../ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml | 0
 .../ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml | 0
 ...x3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml | 0
 ...x1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml | 0
 ...tx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml | 0
 ...ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml | 0
 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml | 0
 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml | 0
 .../ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml | 0
 ...2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml | 0
 .../ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml | 0
 .../ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml | 0
 .../ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml | 0
 .../ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml | 0
 .../ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml | 0
 .../ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml | 0
 .../ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml | 0
 .../ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml | 0
 .../ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml | 0
 .../ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml | 0
 ...tx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml | 0
 ...x7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml | 0
 .../ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0
 ...ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml | 0
 .../ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml | 0
 .../ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml | 0
 .../ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml | 0
 .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 0
 .../ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 0
 ...ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml | 0
 .../ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml | 0
 ...tx11_gen1_dep16_batch256_eplb256_mtp1.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0
 .../ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml | 0
 .../ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml | 0
 .../ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml | 0
 ...tx10_gen1_dep16_batch256_eplb256_mtp0.yaml | 0
 .../ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml | 0
 .../ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml | 0
 .../ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml | 0
 ...x1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 0
 ...tx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml | 0
 ...x1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml | 0
 ...x1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml | 0
 .../ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml | 0
 .../ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml | 0
 .../ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml | 0
 ...1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml | 0
 ...x1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml | 0
 ...ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml | 0
 ...x1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml | 0
 ...x1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml | 0
 .../ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml | 0
 .../ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml | 0
 .../ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml | 0
 .../ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml | 0
 .../ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml | 0
 ...tx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml | 0
 ...ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml | 0
 ...ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 0
 ...tx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0
 .../ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml | 0
 .../ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml | 0
 .../ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml | 0
 ...ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml | 0
 ...tx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml | 0
 ...tx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml | 0
 ...x5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 0
 .../mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml | 0
 .../ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 0
 ...ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml | 0
 .../ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml | 0
 ...ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml | 0
 .../ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml | 0
 .../ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml | 0
 .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 0
 ...ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml | 0
 .../mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 0
 .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 0
 .../ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml | 0
 .../ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml | 0
 .../ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml | 0
 .../ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml | 0
 .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 0
 .../ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml | 0
 .../ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml | 0
 ...tx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0
 ...ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml | 0
 .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 0
 ...2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml | 0
 ...tx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml | 0
 ...x3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml | 0
 .../ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml | 0
 .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 0
 .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 0
 ...2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml | 0
 ...x2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml | 0
 ...x3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml | 0
 ...3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml | 0
 ...10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 0
 .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 0
 .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 0
 ...ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 0
 ...x7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml | 0
 ...tx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 0
 .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 0
 .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 0
 .../ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml | 0
 ...tx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml | 0
 ...tx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml | 0
 ...x7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 0
 ...x7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml | 0
 .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 0
 .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 0
 .../ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 0
 .../ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml | 0
 .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 0
 .../ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 0
 .../ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 0
 .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 0
 .../ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 0
 .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 0
 ...28_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml | 0
 ...16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml | 0
 .../c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml | 0
 ...56_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml | 0
 ...2_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0
 ...4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0
 ...12_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml | 0
 ...64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml | 0
 ...8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 0
 ...28_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml | 0
 ...16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 .../c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml | 0
 ...56_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml | 0
 ...32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 ...c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 ...12_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml | 0
 ...64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 ...c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 0
 ...128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml | 0
 ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml | 0
 .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml | 0
 ...256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml | 0
 ...c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml | 0
 .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml | 0
 ...512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml | 0
 ...c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml | 0
 .../c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml | 0
 ...28_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml | 0
 ...c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml | 0
 .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml | 0
 ...56_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml | 0
 ...32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml | 0
 .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml | 0
 ...12_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml | 0
 ...64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml | 0
 .../c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml | 0
 .../stp/disagg-gb200-1p1d-dep8-dep16.yaml | 0
 .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 0
 .../stp/disagg-gb200-3p1d-dep8-dep16.yaml | 0
 .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 0
 .../stp/disagg-gb200-3p1d-dep8-dep16.yaml | 0
 .../stp/disagg-gb200-7p1d-dep8-dep16.yaml | 0
 ...ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 0
 ...ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 0
 ..._gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 0
 ..._gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 0
 ...p4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 0
 ...tx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 0
 ...tx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 0
 ...4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 0
 ...p4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 0
 ...4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 0
 ...ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 0
 ...ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 0
 ..._gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 0
 ...tx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 0
 .../stp/disagg-gb200-1p1d-dep4-dep16.yaml | 0
 .../stp/disagg-gb200-1p4d-dep4-tep4.yaml | 0
 .../stp/disagg-gb200-1p4d-dep4-tep4.yaml | 0
 .../stp/disagg-gb200-3p1d-dep4-dep16.yaml | 0
 .../stp/disagg-gb200-5p1d-dep4-dep8.yaml | 0
 .../stp/disagg-gb200-6p1d-dep4-dep16.yaml | 0
 372 files changed, 395 insertions(+), 395 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/{1k/1k => 1k1k}/disagg/1k1k.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/{8k/1k => 8k1k}/disagg/8k1k.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{1k/1k => 1k1k}/disagg/1k1k.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_lowlat_0.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_lowlat_1.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_lowlat_2.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_maxtpt_0.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_maxtpt_1.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_maxtpt_2.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/8k1k_mtp_maxtpt_3.yaml (100%)
8k1k}/disagg/stp/8k1k_stp_lowlat_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_lowlat_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_lowlat_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_maxtpt_0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_maxtpt_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_maxtpt_2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/{8k/1k => 8k1k}/disagg/stp/8k1k_stp_maxtpt_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/max-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ultra-tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/mid-curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/low_latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/mid_curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/low_latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/max_tpt.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/mid_curve.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/low-latency.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/max.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/mid.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/low-latency.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/max.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/mid.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{1k/1k => 1k1k}/disagg/stp/h100-fp8-1p1d-max-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{1k/1k => 1k1k}/disagg/stp/h100-fp8-1p2d-max-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{8k/1k => 8k1k}/disagg/stp/h100-fp8-1p1d-max-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/{8k/1k => 8k1k}/disagg/stp/h100-fp8-1p1d-max-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/bs256-1p6d-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/bs256-1p6d-tp-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/low-latency-1p9d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/stp/bs256-1p6d-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/stp/bs256-1p6d-tp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{1k/1k => 1k1k}/disagg/stp/low-latency-1p9d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs128-1p1d-dep-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs16-1p3d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs4-1p7d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs64-2p3d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/bs8-1p6d-mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs128-1p1d-dep.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs16-1p3d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs4-1p7d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs64-2p3d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/{8k/1k => 8k1k}/disagg/stp/bs8-1p6d.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 
8k1k}/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 
8k1k}/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml (100%) rename 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 
8k1k}/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml (100%) 
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/{8k/1k => 8k1k}/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{1k/1k => 1k1k}/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 
8k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/{8k/1k => 8k1k}/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml (100%) rename 
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{1k/1k => 1k1k}/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/{8k/1k => 8k1k}/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{1k/1k => 1k1k}/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/{8k/1k => 8k1k}/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{1k/1k => 1k1k}/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/{8k/1k => 8k1k}/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{1k/1k => 1k1k}/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml (100%)
rename benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/{8k/1k => 8k1k}/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml (100%)

diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md
index 482c9acfc..46755ef31 100644
--- a/.github/configs/CONFIGS.md
+++ b/.github/configs/CONFIGS.md
@@ -69,9 +69,9 @@ search-space:
 ```
 - `recipe` is a path **relative to `benchmarks/multi_node/srt-slurm-recipes/`** in this repo. The schema validator rejects entries whose recipe file does not exist on disk, so adding a new entry requires upstreaming the recipe yaml here first.
-- The path may carry an `:override[N]` / `:override_<name>[N]` suffix to select a named override section inside an sglang-style recipe yaml (e.g. `"dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
+- The path may carry an `:override[N]` / `:override_<name>[N]` suffix to select a named override section inside an sglang-style recipe yaml (e.g. `"dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]"`). The launcher strips this suffix before reading the file but passes the full string to `srtctl`.
 - `recipe` is optional: multi-node entries that do *not* go through srt-slurm (e.g. dynamo-sglang aggregated topologies that drive their own bash) leave it unset.
-- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` organized as `<model>/<framework>/<gpu>-<precision>/<isl>/<osl>/<serving>/<spec>/<name>.yaml` — e.g. `dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml`. A handful of sglang-style files that carry override sections spanning both stp and mtp are parked one level shallower (the trailing `/<spec>` segment is omitted). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
+- Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` organized as `<model>/<framework>/<gpu>-<precision>/<isl><osl>/<serving>/<spec>/<name>.yaml` — e.g. `dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml`. A handful of sglang-style files that carry override sections spanning both stp and mtp are parked one level shallower (the trailing `/<spec>` segment is omitted). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form.
 
 ## Runners
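Concretely, a master-yaml search-space entry now carries the recipe directly. A minimal sketch of the shape (values mirror the first dsr1-fp4-b200-dynamo-trt hunk below; abbreviated, not a complete entry):

```yaml
search-space:
  - spec-decoding: "mtp"
    conc-list: [1214]
    # Path is relative to benchmarks/multi_node/srt-slurm-recipes/ and is
    # validated against on-disk files at sweep generation time.
    recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
    prefill:
      num-worker: 1
      tp: 4
```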
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index bb59f1dd0..9ff2a96aa 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -13,7 +13,7 @@ dsr1-fp4-b200-dynamo-trt:
   search-space:
     - spec-decoding: "mtp"
       conc-list: [1214]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -26,7 +26,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
      conc-list: [875]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -39,7 +39,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [6]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -52,7 +52,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [10, 15, 25, 45, 90, 180]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -65,7 +65,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [ 4968 ]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -78,7 +78,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [10860]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -92,7 +92,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     # Non-MTP configurations
     - conc-list: [4096]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -104,7 +104,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [2192]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -116,7 +116,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [1365]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -128,7 +128,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [6]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -140,7 +140,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [10, 15, 25, 45, 90, 180]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -152,7 +152,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [450]
-      recipe: "dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -169,7 +169,7 @@ dsr1-fp4-b200-dynamo-trt:
   search-space:
     - spec-decoding: "mtp"
       conc-list: [90]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -182,7 +182,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [66]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -195,7 +195,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [6]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -208,7 +208,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [10, 15, 30, 60]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -221,7 +221,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [548]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -234,7 +234,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [1096, 1691]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml"
       prefill:
         num-worker: 5
         tp: 4
@@ -247,7 +247,7 @@ dsr1-fp4-b200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [658]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 5
         tp: 4
@@ -261,7 +261,7 @@ dsr1-fp4-b200-dynamo-trt:
 
     # Non-MTP configurations
     - conc-list: [6]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -273,7 +273,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [10, 15, 25, 50, 100]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 4
@@ -285,7 +285,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [370]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml"
       prefill:
         num-worker: 2
         tp: 4
@@ -297,7 +297,7 @@ dsr1-fp4-b200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [1606]
-      recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml"
"dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" prefill: num-worker: 4 tp: 4 @@ -309,7 +309,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [837] - recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 4 tp: 4 @@ -321,7 +321,7 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2222] - recipe: "dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -349,7 +349,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - Low latency (TP attention) - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -362,7 +362,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" prefill: num-worker: 1 tp: 8 @@ -375,7 +375,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" prefill: num-worker: 1 tp: 8 @@ -388,7 +388,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [256] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" prefill: num-worker: 1 tp: 8 @@ -402,7 +402,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" conc-list: [896] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" prefill: num-worker: 1 tp: 8 @@ -415,7 +415,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1024] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" prefill: num-worker: 1 tp: 8 @@ -428,7 +428,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1184] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" prefill: num-worker: 1 tp: 8 @@ -441,7 +441,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1600] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" prefill: num-worker: 1 tp: 8 @@ -455,7 +455,7 @@ dsr1-fp8-b200-dynamo-trt: # Non-MTP (STP) configurations - Low latency (TP attention) - conc-list: [4] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" + recipe: 
"dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 8 @@ -467,7 +467,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [32] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" prefill: num-worker: 1 tp: 8 @@ -479,7 +479,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -492,7 +492,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [1920] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" prefill: num-worker: 1 tp: 8 @@ -504,7 +504,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4096] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" prefill: num-worker: 1 tp: 8 @@ -516,7 +516,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [5152] - recipe: "dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" + recipe: "dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" prefill: num-worker: 2 tp: 8 @@ -534,7 +534,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - Low latency (TP attention) - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -547,7 +547,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 8 @@ -560,7 +560,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [48] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" prefill: num-worker: 1 tp: 8 @@ -573,7 +573,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [64] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" prefill: num-worker: 1 tp: 8 @@ -587,7 +587,7 @@ dsr1-fp8-b200-dynamo-trt: # MTP configurations - High throughput (DP attention) - spec-decoding: "mtp" conc-list: [224] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" prefill: num-worker: 2 tp: 8 @@ -600,7 +600,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [288] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" + recipe: 
"dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" prefill: num-worker: 2 tp: 8 @@ -613,7 +613,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1088] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" prefill: num-worker: 4 tp: 8 @@ -627,7 +627,7 @@ dsr1-fp8-b200-dynamo-trt: # Non-MTP (STP) configurations - Low latency (TP attention) - conc-list: [1] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" prefill: num-worker: 1 tp: 8 @@ -639,7 +639,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [32] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" prefill: num-worker: 1 tp: 8 @@ -651,7 +651,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -663,7 +663,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 1 dp-attn: false - conc-list: [96] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" prefill: num-worker: 1 tp: 8 @@ -676,7 +676,7 @@ dsr1-fp8-b200-dynamo-trt: dp-attn: false # Non-MTP (STP) configurations - High throughput (DP attention) - conc-list: [128] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -688,7 +688,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [128] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 8 @@ -700,7 +700,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" prefill: num-worker: 1 tp: 8 @@ -712,7 +712,7 @@ dsr1-fp8-b200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [640] - recipe: "dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" + recipe: "dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" prefill: num-worker: 2 tp: 8 @@ -740,7 +740,7 @@ dsr1-fp4-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [654] - recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -753,7 +753,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [271] - recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -766,7 +766,7 @@ 
@@ -766,7 +766,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [11]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -779,7 +779,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [10, 20, 25, 60, 120, 200]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -792,7 +792,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [2342]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml"
       prefill:
         num-worker: 2
         tp: 2
@@ -805,7 +805,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [8609]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml"
       prefill:
         num-worker: 5
         tp: 2
@@ -818,7 +818,7 @@ dsr1-fp4-b300-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [12926]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml"
       prefill:
         num-worker: 5
         tp: 2
@@ -832,7 +832,7 @@ dsr1-fp4-b300-dynamo-trt:
 
     # Non-MTP configurations
     - conc-list: [1176]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -844,7 +844,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [6]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -856,7 +856,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [5, 10, 15, 25]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -868,7 +868,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 4
         dp-attn: false
     - conc-list: [60, 110, 195, 395]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 2
@@ -880,7 +880,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [4405]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml"
       prefill:
         num-worker: 2
         tp: 2
@@ -892,7 +892,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [8192]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml"
       prefill:
         num-worker: 3
         tp: 2
@@ -904,7 +904,7 @@ dsr1-fp4-b300-dynamo-trt:
         ep: 8
         dp-attn: true
     - conc-list: [4611]
-      recipe: "dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml"
"dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -921,7 +921,7 @@ dsr1-fp4-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [2198] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 @@ -934,7 +934,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [52] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -947,7 +947,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -960,7 +960,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [32] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -973,7 +973,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [181] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 2 @@ -986,7 +986,7 @@ dsr1-fp4-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1197] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" prefill: num-worker: 9 tp: 2 @@ -1000,7 +1000,7 @@ dsr1-fp4-b300-dynamo-trt: # Non-MTP configurations - conc-list: [105] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1012,7 +1012,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [63] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1024,7 +1024,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1036,7 +1036,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -1048,7 +1048,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [589] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 2 @@ -1060,7 +1060,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1093] - recipe: 
"dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 6 tp: 2 @@ -1072,7 +1072,7 @@ dsr1-fp4-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2048] - recipe: "dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 8 tp: 2 @@ -1100,7 +1100,7 @@ dsr1-fp8-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [10] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" prefill: num-worker: 1 tp: 4 @@ -1113,7 +1113,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [160] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" prefill: num-worker: 1 tp: 4 @@ -1126,7 +1126,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [3072] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" prefill: num-worker: 1 tp: 4 @@ -1139,7 +1139,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2560] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" prefill: num-worker: 1 tp: 4 @@ -1152,7 +1152,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [720] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" prefill: num-worker: 1 tp: 4 @@ -1165,7 +1165,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [11264] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" prefill: num-worker: 3 tp: 4 @@ -1181,7 +1181,7 @@ dsr1-fp8-b300-dynamo-trt: osl: 1024 search-space: - conc-list: [2112] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" prefill: num-worker: 1 tp: 4 @@ -1193,7 +1193,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [3072] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" prefill: num-worker: 1 tp: 4 @@ -1205,7 +1205,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [1280] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" prefill: num-worker: 1 tp: 4 @@ -1217,7 +1217,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: true - conc-list: [12] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" + recipe: 
"dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" prefill: num-worker: 1 tp: 4 @@ -1229,7 +1229,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [128] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" prefill: num-worker: 1 tp: 4 @@ -1241,7 +1241,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [384] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" prefill: num-worker: 1 tp: 4 @@ -1253,7 +1253,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [16384] - recipe: "dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" + recipe: "dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" prefill: num-worker: 2 tp: 4 @@ -1270,7 +1270,7 @@ dsr1-fp8-b300-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [40] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" prefill: num-worker: 1 tp: 4 @@ -1283,7 +1283,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 @@ -1296,7 +1296,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [20] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" prefill: num-worker: 1 tp: 4 @@ -1309,7 +1309,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [72] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" prefill: num-worker: 1 tp: 4 @@ -1322,7 +1322,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [144] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" prefill: num-worker: 2 tp: 4 @@ -1335,7 +1335,7 @@ dsr1-fp8-b300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" prefill: num-worker: 4 tp: 4 @@ -1351,7 +1351,7 @@ dsr1-fp8-b300-dynamo-trt: osl: 1024 search-space: - conc-list: [64] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" prefill: num-worker: 1 tp: 4 @@ -1363,7 +1363,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - conc-list: [16] - recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" + recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" prefill: num-worker: 1 tp: 4 @@ -1375,7 +1375,7 @@ dsr1-fp8-b300-dynamo-trt: ep: 1 dp-attn: false - 
    - conc-list: [256]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml"
       prefill:
         num-worker: 2
         tp: 4
@@ -1387,7 +1387,7 @@ dsr1-fp8-b300-dynamo-trt:
         ep: 1
         dp-attn: true
     - conc-list: [512]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -1399,7 +1399,7 @@ dsr1-fp8-b300-dynamo-trt:
         ep: 1
         dp-attn: true
     - conc-list: [256]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml"
       prefill:
         num-worker: 3
         tp: 4
@@ -1411,7 +1411,7 @@ dsr1-fp8-b300-dynamo-trt:
         ep: 1
         dp-attn: false
     - conc-list: [1075]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml"
       prefill:
         num-worker: 5
         tp: 4
@@ -1423,7 +1423,7 @@ dsr1-fp8-b300-dynamo-trt:
         ep: 1
         dp-attn: true
     - conc-list: [3072]
-      recipe: "dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
+      recipe: "dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml"
       prefill:
         num-worker: 7
         tp: 4
@@ -2440,7 +2440,7 @@ dsr1-fp8-h200-dynamo-trt:
     # MTP configurations
     - spec-decoding: "mtp"
       conc-list: [1]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2453,7 +2453,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [4]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2466,7 +2466,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [8]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2479,7 +2479,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [16]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2492,7 +2492,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [32]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2505,7 +2505,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [64]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2518,7 +2518,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [128]
-      recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml"
"dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2531,7 +2531,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [256] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 8 @@ -2544,7 +2544,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 8 @@ -2557,7 +2557,7 @@ dsr1-fp8-h200-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [1] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2569,7 +2569,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [4] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2581,7 +2581,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2593,7 +2593,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [16] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2605,7 +2605,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [32] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2617,7 +2617,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [64] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2629,7 +2629,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [128] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2641,7 +2641,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2653,7 +2653,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [512] - recipe: "dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2670,7 +2670,7 @@ dsr1-fp8-h200-dynamo-trt: # MTP 
@@ -2670,7 +2670,7 @@ dsr1-fp8-h200-dynamo-trt:
     # MTP configurations
     - spec-decoding: "mtp"
       conc-list: [1]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2683,7 +2683,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [4]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2696,7 +2696,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [8]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2709,7 +2709,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [16]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2722,7 +2722,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [32]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml"
       prefill:
         num-worker: 3
         tp: 8
@@ -2735,7 +2735,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: false
     - spec-decoding: "mtp"
       conc-list: [64]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2748,7 +2748,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [128]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml"
       prefill:
         num-worker: 2
         tp: 8
@@ -2761,7 +2761,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [256]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml"
       prefill:
         num-worker: 3
         tp: 8
@@ -2774,7 +2774,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     - spec-decoding: "mtp"
       conc-list: [512]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml"
       prefill:
         num-worker: 3
         tp: 8
@@ -2787,7 +2787,7 @@ dsr1-fp8-h200-dynamo-trt:
         dp-attn: true
     # Non-MTP configurations (STP)
     - conc-list: [1]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2799,7 +2799,7 @@ dsr1-fp8-h200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [4]
-      recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml"
+      recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml"
       prefill:
         num-worker: 1
         tp: 8
@@ -2811,7 +2811,7 @@ dsr1-fp8-h200-dynamo-trt:
         ep: 8
         dp-attn: false
     - conc-list: [8]
"dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2823,7 +2823,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [16] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2835,7 +2835,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [32] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2847,7 +2847,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [64] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 8 @@ -2859,7 +2859,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [128] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 8 @@ -2871,7 +2871,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [256] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 8 @@ -2883,7 +2883,7 @@ dsr1-fp8-h200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [512] - recipe: "dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 8 @@ -2911,7 +2911,7 @@ dsr1-fp8-h100-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [6] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2924,7 +2924,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2937,7 +2937,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [30] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2950,7 +2950,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [60] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2963,7 +2963,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [117] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" 
prefill: num-worker: 1 tp: 16 @@ -2976,7 +2976,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [231] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -2989,7 +2989,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [462] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3002,7 +3002,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [615] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" prefill: num-worker: 1 tp: 16 @@ -3015,7 +3015,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 16 @@ -3028,7 +3028,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true # Non-MTP configurations (STP) - conc-list: [6] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3040,7 +3040,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [9] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3052,7 +3052,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [30] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3064,7 +3064,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [60] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3076,7 +3076,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [231] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3088,7 +3088,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [462] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3100,7 +3100,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [924] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3112,7 +3112,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1845] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" + 
recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3124,7 +3124,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: true - conc-list: [4916] - recipe: "dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 16 @@ -3141,7 +3141,7 @@ dsr1-fp8-h100-dynamo-trt: # MTP configurations (6 points) - spec-decoding: "mtp" conc-list: [6] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3154,7 +3154,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3167,7 +3167,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [30] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3180,7 +3180,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [77] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 16 @@ -3195,7 +3195,7 @@ dsr1-fp8-h100-dynamo-trt: # https://github.com/InferenceMAX/InferenceMAX/actions/runs/21769314582/job/62813105509 # - spec-decoding: "mtp" # conc-list: [78] - # recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" + # recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3208,7 +3208,7 @@ dsr1-fp8-h100-dynamo-trt: # dp-attn: false - spec-decoding: "mtp" conc-list: [154] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" prefill: num-worker: 2 tp: 16 @@ -3221,7 +3221,7 @@ dsr1-fp8-h100-dynamo-trt: dp-attn: true # STP configurations (5 points) - conc-list: [6] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3233,7 +3233,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [9] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3245,7 +3245,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [30] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 16 @@ -3257,7 +3257,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [154] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" 
prefill: num-worker: 1 tp: 16 @@ -3269,7 +3269,7 @@ dsr1-fp8-h100-dynamo-trt: ep: 16 dp-attn: false - conc-list: [308] - recipe: "dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 16 @@ -3494,7 +3494,7 @@ dsr1-fp8-h100-dynamo-sglang: search-space: # # STP: Max throughput TEP (1 prefill, 2 decode) # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml" + # recipe: "dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3507,7 +3507,7 @@ dsr1-fp8-h100-dynamo-sglang: # dp-attn: false # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" + # recipe: "dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3521,7 +3521,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput TEP (1 prefill, 2 decode) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3535,7 +3535,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64] - recipe: "dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3551,7 +3551,7 @@ dsr1-fp8-h100-dynamo-sglang: search-space: # # STP: Max throughput TEP (1 prefill, 1 decode) # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml" + # recipe: "dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3564,7 +3564,7 @@ dsr1-fp8-h100-dynamo-sglang: # dp-attn: false # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" + # recipe: "dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml" # prefill: # num-worker: 1 # tp: 16 @@ -3578,7 +3578,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput TEP (1 prefill, 1 decode) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3592,7 +3592,7 @@ dsr1-fp8-h100-dynamo-sglang: # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [1, 2, 4, 8, 16, 32, 64] - recipe: "dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + recipe: "dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" prefill: num-worker: 1 tp: 16 @@ -3687,7 +3687,7 @@ dsr1-fp4-gb200-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [ 180 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3700,7 +3700,7 @@ 
dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 4, 8, 12, 24, 48 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3713,7 +3713,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 4301 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" prefill: num-worker: 2 tp: 4 @@ -3726,7 +3726,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 2253 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -3739,7 +3739,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 16130 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 3 tp: 4 @@ -3754,7 +3754,7 @@ dsr1-fp4-gb200-dynamo-trt: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4301 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3766,7 +3766,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 666 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3778,7 +3778,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 6144 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3790,7 +3790,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 4 dp-attn: true - conc-list: [ 12, 24, 48, 96, 192 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3802,7 +3802,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3814,7 +3814,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 4301 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3826,7 +3826,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 2253 ] - recipe: "dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3844,7 +3844,7 @@ dsr1-fp4-gb200-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [ 4, 8, 
12, 24, 48 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 4 @@ -3857,7 +3857,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [ 180 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 3 tp: 4 @@ -3870,7 +3870,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 1229 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" prefill: num-worker: 7 tp: 4 @@ -3883,7 +3883,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 666 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 4 @@ -3896,7 +3896,7 @@ dsr1-fp4-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [ 4301 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" prefill: num-worker: 11 tp: 4 @@ -3910,7 +3910,7 @@ dsr1-fp4-gb200-dynamo-trt: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 12, 44, 76 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3922,7 +3922,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -3934,7 +3934,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 333 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -3946,7 +3946,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 1229 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -3958,7 +3958,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 2253 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 8 tp: 4 @@ -3970,7 +3970,7 @@ dsr1-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 4096 ] - recipe: "dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 10 tp: 4 @@ -3999,7 +3999,7 @@ dsr1-fp8-gb200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [4301] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" + recipe: 
"dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" prefill: num-worker: 1 tp: 8 @@ -4012,7 +4012,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2151] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" prefill: num-worker: 1 tp: 8 @@ -4025,7 +4025,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" prefill: num-worker: 1 tp: 8 @@ -4038,7 +4038,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [615] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" prefill: num-worker: 1 tp: 8 @@ -4051,7 +4051,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [36] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" prefill: num-worker: 1 tp: 8 @@ -4064,7 +4064,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [18] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" prefill: num-worker: 1 tp: 8 @@ -4077,7 +4077,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [9] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" prefill: num-worker: 1 tp: 8 @@ -4090,7 +4090,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 1k1k STP configs - conc-list: [6144] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" prefill: num-worker: 1 tp: 8 @@ -4102,7 +4102,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [4301] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" prefill: num-worker: 1 tp: 8 @@ -4114,7 +4114,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [2151] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml" prefill: num-worker: 1 tp: 8 @@ -4126,7 +4126,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1127] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml" prefill: num-worker: 1 tp: 8 @@ -4138,7 +4138,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [256] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml" 
prefill: num-worker: 1 tp: 8 @@ -4150,7 +4150,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [27] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml" prefill: num-worker: 1 tp: 8 @@ -4162,7 +4162,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [3] - recipe: "dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml" + recipe: "dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml" prefill: num-worker: 1 tp: 8 @@ -4179,7 +4179,7 @@ dsr1-fp8-gb200-dynamo-trt: search-space: - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" prefill: num-worker: 3 tp: 8 @@ -4192,7 +4192,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 5 tp: 8 @@ -4205,7 +4205,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" prefill: num-worker: 3 tp: 8 @@ -4218,7 +4218,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" prefill: num-worker: 4 tp: 8 @@ -4231,7 +4231,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [90] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" prefill: num-worker: 2 tp: 8 @@ -4244,7 +4244,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [15] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" prefill: num-worker: 1 tp: 8 @@ -4257,7 +4257,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [6] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" prefill: num-worker: 1 tp: 8 @@ -4270,7 +4270,7 @@ dsr1-fp8-gb200-dynamo-trt: dp-attn: false # 8k1k STP configs - conc-list: [1229] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 5 tp: 8 @@ -4282,7 +4282,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [666] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 8 @@ -4294,7 +4294,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - 
conc-list: [615] - recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" prefill: num-worker: 3 tp: 8 @@ -4306,7 +4306,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [333] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" prefill: num-worker: 2 tp: 8 @@ -4318,7 +4318,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [63] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" prefill: num-worker: 1 tp: 8 @@ -4330,7 +4330,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [18] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" prefill: num-worker: 1 tp: 8 @@ -4342,7 +4342,7 @@ dsr1-fp8-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [6] - recipe: "dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" + recipe: "dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" prefill: num-worker: 1 tp: 8 @@ -4370,7 +4370,7 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] - recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4384,7 +4384,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) - conc-list: [1024, 2048, 4096] - recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 3 tp: 8 @@ -4398,7 +4398,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] - recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 2 tp: 8 @@ -4412,7 +4412,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Ultra throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) - conc-list: [4096] - recipe: "dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml" prefill: num-worker: 1 tp: 8 @@ -4429,7 +4429,7 @@ dsr1-fp8-gb200-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) - conc-list: [4, 8, 16] - recipe: "dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 8 @@ -4443,7 +4443,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [512, 1024, 2048, 6144] - recipe: "dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 5 tp: 8 @@ -4457,7 +4457,7 @@ dsr1-fp8-gb200-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096, 6144] - recipe:
"dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml" + recipe: "dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml" prefill: num-worker: 6 tp: 8 @@ -4484,7 +4484,7 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) - conc-list: [4, 8, 16, 32] - recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4498,7 +4498,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [1024, 2048, 4096, 6144] - recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml" prefill: num-worker: 2 tp: 8 @@ -4512,7 +4512,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) - conc-list: [4096, 7168, 7680] - recipe: "dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml" + recipe: "dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml" prefill: num-worker: 1 tp: 8 @@ -4529,7 +4529,7 @@ dsr1-fp8-gb300-dynamo-sglang: search-space: # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - conc-list: [4, 8] - recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4543,7 +4543,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - conc-list: [128, 256, 512, 1024] - recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml" prefill: num-worker: 5 tp: 8 @@ -4557,7 +4557,7 @@ dsr1-fp8-gb300-dynamo-sglang: # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - conc-list: [2048, 4096] - recipe: "dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml" + recipe: "dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml" prefill: num-worker: 6 tp: 8 @@ -4586,7 +4586,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] - recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4601,7 +4601,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] - recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml" prefill: num-worker: 4 tp: 4 @@ -4616,7 +4616,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 2048, 4096 ] - recipe: "dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 4 tp: 4 @@ -4635,7 +4635,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8 ] - recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml" prefill: num-worker: 1 tp: 4 @@ -4650,7 +4650,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] - recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml" 
prefill: num-worker: 6 tp: 4 @@ -4665,7 +4665,7 @@ dsr1-fp4-gb200-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] - recipe: "dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml" + recipe: "dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml" prefill: num-worker: 10 tp: 4 @@ -4693,7 +4693,7 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations - spec-decoding: "mtp" conc-list: [3226] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" prefill: num-worker: 1 tp: 2 @@ -4706,7 +4706,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [333] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" prefill: num-worker: 1 tp: 2 @@ -4719,7 +4719,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [5] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4732,7 +4732,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 12, 24, 48] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4745,7 +4745,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" prefill: num-worker: 3 tp: 2 @@ -4758,7 +4758,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" prefill: num-worker: 3 tp: 2 @@ -4771,7 +4771,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [5] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4783,7 +4783,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12, 48, 96, 192] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4795,7 +4795,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [8192] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -4807,7 +4807,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 2 @@ -4819,7 +4819,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 32 dp-attn: true - 
conc-list: [4301] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -4831,7 +4831,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 2 @@ -4848,7 +4848,7 @@ dsr1-fp4-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [33] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4861,7 +4861,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [5] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4874,7 +4874,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [12, 24] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" prefill: num-worker: 1 tp: 2 @@ -4887,7 +4887,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" prefill: num-worker: 4 tp: 2 @@ -4900,7 +4900,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [308] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" prefill: num-worker: 8 tp: 2 @@ -4913,7 +4913,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" prefill: num-worker: 10 tp: 2 @@ -4926,7 +4926,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" prefill: num-worker: 10 tp: 2 @@ -4939,7 +4939,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1127] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" prefill: num-worker: 13 tp: 2 @@ -4952,7 +4952,7 @@ dsr1-fp4-gb300-dynamo-trt: dp-attn: true # Non-MTP configurations (default spec_decoding="none") - conc-list: [72] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4964,7 +4964,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [5] - recipe: 
"dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4976,7 +4976,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [12] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -4988,7 +4988,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [5, 15, 30] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 2 @@ -5000,7 +5000,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 4 dp-attn: false - conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 2 @@ -5012,7 +5012,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" prefill: num-worker: 9 tp: 2 @@ -5024,7 +5024,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [3228] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" prefill: num-worker: 11 tp: 2 @@ -5036,7 +5036,7 @@ dsr1-fp4-gb300-dynamo-trt: ep: 4 dp-attn: true - conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" + recipe: "dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 14 tp: 2 @@ -5065,7 +5065,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Low latency (1 prefill node, 2 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32 ] - recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml" + recipe: "dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml" prefill: num-worker: 1 tp: 4 @@ -5080,7 +5080,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Mid curve (4 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] - recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml" + recipe: "dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml" prefill: num-worker: 4 tp: 4 @@ -5095,7 +5095,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Max throughput (4 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096, 8192 ] - recipe: "dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml" + recipe: "dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml" prefill: num-worker: 4 tp: 4 @@ -5114,7 +5114,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Low latency (1 prefill node, 4 decode nodes) - spec-decoding: "none" conc-list: [ 4, 8, 32, 64 ] - recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml" + recipe: "dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml" prefill: num-worker: 1 tp: 4 @@ -5129,7 +5129,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Mid curve (6 prefill nodes, 12 decode nodes) - spec-decoding: "none" conc-list: [ 512, 2048, 4096 ] - recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml" + recipe: "dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml" 
prefill: num-worker: 6 tp: 4 @@ -5144,7 +5144,7 @@ dsr1-fp4-gb300-dynamo-sglang: # Max throughput (10 prefill nodes, 8 decode nodes) - spec-decoding: "none" conc-list: [ 2048 ] - recipe: "dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml" + recipe: "dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml" prefill: num-worker: 10 tp: 4 @@ -5172,7 +5172,7 @@ dsr1-fp8-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 @@ -5185,7 +5185,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [24] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" prefill: num-worker: 1 tp: 4 @@ -5198,7 +5198,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [180] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" prefill: num-worker: 1 tp: 4 @@ -5211,7 +5211,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [564] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" prefill: num-worker: 2 tp: 4 @@ -5224,7 +5224,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 1 tp: 4 @@ -5237,7 +5237,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" prefill: num-worker: 2 tp: 4 @@ -5250,7 +5250,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [8192] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" prefill: num-worker: 3 tp: 4 @@ -5263,7 +5263,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true # STP configurations (no spec_decoding) - conc-list: [4] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 4 @@ -5275,7 +5275,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [24] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" prefill: num-worker: 1 tp: 4 @@ -5287,7 +5287,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [84] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" prefill: num-worker: 1 tp: 4 @@ -5299,7 +5299,7 @@ 
dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" prefill: num-worker: 2 tp: 4 @@ -5311,7 +5311,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [2253] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" prefill: num-worker: 2 tp: 4 @@ -5323,7 +5323,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [8602] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" prefill: num-worker: 3 tp: 4 @@ -5335,7 +5335,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: true - conc-list: [12288] - recipe: "dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" + recipe: "dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" prefill: num-worker: 3 tp: 4 @@ -5352,7 +5352,7 @@ dsr1-fp8-gb300-dynamo-trt: # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" prefill: num-worker: 1 tp: 4 @@ -5365,7 +5365,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [24] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" prefill: num-worker: 1 tp: 4 @@ -5378,7 +5378,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: false - spec-decoding: "mtp" conc-list: [333] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" prefill: num-worker: 6 tp: 4 @@ -5391,7 +5391,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" prefill: num-worker: 8 tp: 4 @@ -5404,7 +5404,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" prefill: num-worker: 10 tp: 4 @@ -5417,7 +5417,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true - spec-decoding: "mtp" conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" prefill: num-worker: 7 tp: 4 @@ -5430,7 +5430,7 @@ dsr1-fp8-gb300-dynamo-trt: dp-attn: true # STP configurations (no spec_decoding) - conc-list: [4] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" prefill: num-worker: 1 tp: 4 @@ -5442,7 +5442,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 
dp-attn: false - conc-list: [24] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" prefill: num-worker: 1 tp: 4 @@ -5454,7 +5454,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [36] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" prefill: num-worker: 1 tp: 4 @@ -5466,7 +5466,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 8 dp-attn: false - conc-list: [512] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" prefill: num-worker: 6 tp: 4 @@ -5478,7 +5478,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 32 dp-attn: true - conc-list: [666] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" prefill: num-worker: 4 tp: 4 @@ -5490,7 +5490,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [1229] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" prefill: num-worker: 7 tp: 4 @@ -5502,7 +5502,7 @@ dsr1-fp8-gb300-dynamo-trt: ep: 16 dp-attn: true - conc-list: [2151] - recipe: "dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" + recipe: "dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" prefill: num-worker: 7 tp: 4 @@ -5800,7 +5800,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "none" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml" prefill: num-worker: 1 tp: 8 @@ -5814,7 +5814,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [512, 1024, 2048] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml" prefill: num-worker: 1 tp: 8 @@ -5828,7 +5828,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 6 decode, dp-attention) - spec-decoding: "none" conc-list: [128, 256, 512, 1024, 2048] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml" prefill: num-worker: 1 tp: 8 @@ -5842,7 +5842,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency (1 prefill, 9 decode, TEP) - spec-decoding: "mtp" conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5856,7 +5856,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [512, 1024, 2048] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5870,7 +5870,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 6 decode, 
dp-attention) - spec-decoding: "mtp" conc-list: [128, 256, 512, 1024, 2048] - recipe: "dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5887,7 +5887,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "none" conc-list: [1, 4, 8] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml" prefill: num-worker: 1 tp: 8 @@ -5901,7 +5901,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (1 prefill, 6 decode) - spec-decoding: "none" conc-list: [4, 8, 16] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml" prefill: num-worker: 1 tp: 8 @@ -5915,7 +5915,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (1 prefill, 3 decode) - spec-decoding: "none" conc-list: [8, 16, 32] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml" prefill: num-worker: 1 tp: 8 @@ -5929,7 +5929,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: TEP (2 prefill, 3 decode) - spec-decoding: "none" conc-list: [32, 64, 128] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml" prefill: num-worker: 2 tp: 8 @@ -5943,7 +5943,7 @@ dsr1-fp8-h200-dynamo-sglang: # STP: High throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "none" conc-list: [64, 128, 256] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml" prefill: num-worker: 1 tp: 8 @@ -5957,7 +5957,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: Low latency TEP (1 prefill, 7 decode) - spec-decoding: "mtp" conc-list: [1, 4, 8] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5971,7 +5971,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (1 prefill, 6 decode) - spec-decoding: "mtp" conc-list: [2, 4, 8, 16, 32] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5985,7 +5985,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (1 prefill, 3 decode) - spec-decoding: "mtp" conc-list: [4, 8, 16, 32, 64] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -5999,7 +5999,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: TEP (2 prefill, 3 decode) - spec-decoding: "mtp" conc-list: [32, 64, 128] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml" prefill: num-worker: 2 tp: 8 @@ -6013,7 +6013,7 @@ dsr1-fp8-h200-dynamo-sglang: # MTP: High throughput DEP (1 prefill, 1 decode, dp-attention) - spec-decoding: "mtp" conc-list: [32, 64, 128, 256, 512] - recipe: "dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml" + recipe: "dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml" prefill: num-worker: 1 tp: 8 @@ -6040,7 +6040,7 @@ dsr1-fp4-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [16, 128] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]" + recipe: 
"dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 4 @@ -6052,7 +6052,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [32, 64, 256] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 4 @@ -6064,7 +6064,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]" prefill: num-worker: 1 tp: 4 @@ -6076,7 +6076,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]" prefill: num-worker: 1 tp: 4 @@ -6092,7 +6092,7 @@ dsr1-fp4-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [64, 128] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 4 @@ -6104,7 +6104,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [8] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 4 @@ -6116,7 +6116,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [4, 128] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_stp_lowlat[2]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_stp_lowlat[2]" prefill: num-worker: 2 tp: 4 @@ -6128,7 +6128,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [4, 8, 16, 64] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_stp_tp4" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:override_stp_tp4" prefill: num-worker: 1 tp: 4 @@ -6140,7 +6140,7 @@ dsr1-fp4-b200-dynamo-sglang: ep: 1 dp-attn: false - conc-list: [1024, 2048] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_stp_maxtpt_7p2d" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:override_stp_maxtpt_7p2d" prefill: num-worker: 7 tp: 4 @@ -6167,7 +6167,7 @@ dsr1-fp8-b200-dynamo-sglang: search-space: # Non-MTP configurations - conc-list: [4] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_stp_lowlat[0]" prefill: num-worker: 1 tp: 8 @@ -6179,7 +6179,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [16, 32, 64, 128, 256] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_stp_lowlat[1]" prefill: num-worker: 1 tp: 8 @@ -6191,7 +6191,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: false - conc-list: [1024, 2048, 4096] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[0]" prefill: num-worker: 1 tp: 8 @@ -6203,7 +6203,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [2048, 4096] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]" + recipe: 
"dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_stp_maxtpt[1]" prefill: num-worker: 2 tp: 8 @@ -6219,7 +6219,7 @@ dsr1-fp8-b200-dynamo-sglang: search-space: # STP low-latency: resolved from 8k1k.yaml zip_override_stp_lowlat - conc-list: [128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml" prefill: num-worker: 1 tp: 8 @@ -6231,7 +6231,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: false - conc-list: [128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml" prefill: num-worker: 1 tp: 8 @@ -6243,7 +6243,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 1 dp-attn: false - conc-list: [8, 16, 32, 64, 128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml" prefill: num-worker: 1 tp: 8 @@ -6256,7 +6256,7 @@ dsr1-fp8-b200-dynamo-sglang: dp-attn: false # STP max-throughput: resolved from 8k1k.yaml zip_override_stp_maxtpt - conc-list: [288] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml" prefill: num-worker: 1 tp: 8 @@ -6268,7 +6268,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [160, 288] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml" prefill: num-worker: 1 tp: 8 @@ -6280,7 +6280,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [512] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml" prefill: num-worker: 2 tp: 8 @@ -6292,7 +6292,7 @@ dsr1-fp8-b200-dynamo-sglang: ep: 8 dp-attn: true - conc-list: [1024] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml" prefill: num-worker: 3 tp: 8 @@ -6320,7 +6320,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: 1P1D - spec-decoding: "mtp" conc-list: [4, 64] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 8 @@ -6334,7 +6334,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: 1P3D - spec-decoding: "mtp" conc-list: [4, 8, 16, 32, 128] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 8 @@ -6348,7 +6348,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 1P5D - spec-decoding: "mtp" conc-list: [512, 4096] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]" prefill: num-worker: 1 tp: 8 @@ -6362,7 +6362,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 2P5D - spec-decoding: "mtp" conc-list: [1024, 2048, 4096] - recipe: "dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[2]" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[2]" prefill: num-worker: 2 tp: 8 @@ -6376,7 +6376,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-tpt: 1P2D - spec-decoding: "mtp" conc-list: [512, 1024, 2048] - recipe: 
"dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml:override_mtp_maxtpt_1p2d" + recipe: "dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml:override_mtp_maxtpt_1p2d" prefill: num-worker: 1 tp: 8 @@ -6393,7 +6393,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP low-latency: resolved from 8k1k.yaml zip_override_mtp_lowlat - spec-decoding: "mtp" conc-list: [128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml" prefill: num-worker: 1 tp: 8 @@ -6406,7 +6406,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml" prefill: num-worker: 1 tp: 8 @@ -6419,7 +6419,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [8, 16, 32, 64, 128] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml" prefill: num-worker: 1 tp: 8 @@ -6433,7 +6433,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: # MTP max-throughput: resolved from 8k1k.yaml zip_override_mtp_maxtpt - spec-decoding: "mtp" conc-list: [288] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml" prefill: num-worker: 1 tp: 8 @@ -6446,7 +6446,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [160, 288] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml" prefill: num-worker: 1 tp: 8 @@ -6459,7 +6459,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml" prefill: num-worker: 2 tp: 8 @@ -6472,7 +6472,7 @@ dsr1-fp8-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [1024] - recipe: "dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml" + recipe: "dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml" prefill: num-worker: 3 tp: 8 @@ -6499,7 +6499,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: search-space: - spec-decoding: "mtp" conc-list: [16, 512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 4 @@ -6512,7 +6512,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [32, 64, 256, 512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 4 @@ -6525,7 +6525,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [512, 1024] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[0]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[0]" prefill: num-worker: 1 tp: 4 @@ -6538,7 +6538,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: true - spec-decoding: "mtp" conc-list: [512] - recipe: "dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]" + recipe: "dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml:zip_override_mtp_maxtpt[1]" prefill: num-worker: 1 tp: 4 @@ -6557,7 +6557,7 @@ 
dsr1-fp4-b200-dynamo-sglang-mtp: search-space: - spec-decoding: "mtp" conc-list: [64, 128] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[0]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[0]" prefill: num-worker: 1 tp: 4 @@ -6570,7 +6570,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [8] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[1]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[1]" prefill: num-worker: 1 tp: 4 @@ -6583,7 +6583,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [4, 128] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[2]" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:zip_override_mtp_lowlat[2]" prefill: num-worker: 2 tp: 4 @@ -6596,7 +6596,7 @@ dsr1-fp4-b200-dynamo-sglang-mtp: dp-attn: false - spec-decoding: "mtp" conc-list: [4, 8, 16, 64] - recipe: "dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml:override_mtp_tp4" + recipe: "dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml:override_mtp_tp4" prefill: num-worker: 1 tp: 4 @@ -6623,7 +6623,7 @@ kimik2.5-fp4-gb200-dynamo-trt: search-space: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4, 192, 360, 668 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6635,7 +6635,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 5, 15, 30, 55 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6647,7 +6647,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 4 dp-attn: false - conc-list: [ 666 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6659,7 +6659,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 2253 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6671,7 +6671,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 32 dp-attn: true - conc-list: [ 4301, 6452 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6683,7 +6683,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 4301 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -6695,7 +6695,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 4301 ] - recipe: "kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ 
-6712,7 +6712,7 @@ kimik2.5-fp4-gb200-dynamo-trt: search-space: # Non-MTP configurations (default spec_decoding="none") - conc-list: [ 4 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6724,7 +6724,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: false - conc-list: [ 156 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6736,7 +6736,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 4 dp-attn: false - conc-list: [ 5, 15, 30, 60, 105 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" prefill: num-worker: 1 tp: 4 @@ -6748,7 +6748,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 4 dp-attn: false - conc-list: [ 333 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" prefill: num-worker: 2 tp: 4 @@ -6760,7 +6760,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 615 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" prefill: num-worker: 3 tp: 4 @@ -6772,7 +6772,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 16 dp-attn: true - conc-list: [ 2151 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" prefill: num-worker: 5 tp: 4 @@ -6784,7 +6784,7 @@ kimik2.5-fp4-gb200-dynamo-trt: ep: 8 dp-attn: true - conc-list: [ 2253 ] - recipe: "kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" + recipe: "kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" prefill: num-worker: 7 tp: 4 @@ -6810,7 +6810,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [256, 512, 1024, 2048, 3072, 4096] - recipe: "kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml" prefill: num-worker: 1 tp: 4 @@ -6822,7 +6822,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - conc-list: [4, 8, 16, 32, 64, 128] - recipe: "kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 @@ -6837,7 +6837,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: osl: 1024 search-space: - conc-list: [4, 8, 16, 32, 128] - recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml" prefill: num-worker: 1 tp: 4 @@ -6849,7 +6849,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 4 dp-attn: false - conc-list: [512, 1024] - recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml" + recipe: 
"kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml" prefill: num-worker: 3 tp: 4 @@ -6861,7 +6861,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - conc-list: [2048] - recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml" prefill: num-worker: 5 tp: 4 @@ -6873,7 +6873,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: ep: 8 dp-attn: true - conc-list: [3072, 4096] - recipe: "kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml" + recipe: "kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml" prefill: num-worker: 6 tp: 4 @@ -6905,7 +6905,7 @@ dsv4-fp4-gb200-dynamo-vllm: # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). - conc-list: [1, 4, 8, 16, 32, 64] - recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 @@ -6919,7 +6919,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [128, 256, 1024, 2048, 4096] - recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml" prefill: num-worker: 1 tp: 8 @@ -6934,7 +6934,7 @@ dsv4-fp4-gb200-dynamo-vllm: # The 4096 overlap with the 1p1d block gives a crossover point. 8192 # would saturate 1p1d's prefill, so this topology takes over there. - conc-list: [4096, 8192] - recipe: "dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 @@ -6952,7 +6952,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - conc-list: [1, 4, 8, 16, 32, 64] - recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml" prefill: num-worker: 1 tp: 8 @@ -6965,7 +6965,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: false # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - conc-list: [512, 1024] - recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml" prefill: num-worker: 3 tp: 8 @@ -6979,7 +6979,7 @@ dsv4-fp4-gb200-dynamo-vllm: # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. 
- conc-list: [4096, 8192] - recipe: "dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml" + recipe: "dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml" prefill: num-worker: 7 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k/1k/disagg/1k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/1k1k/disagg/1k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k/1k/disagg/8k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp4/8k1k/disagg/8k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k/1k/disagg/1k1k.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/1k1k/disagg/1k1k.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_lowlat_2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k/1k/disagg/stp/8k1k_stp_maxtpt_3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/max-tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k/1k/disagg/stp/mid-curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/max-tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k/1k/disagg/stp/mid-curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/max-tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/mid-curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k/1k/disagg/stp/ultra-tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/max_tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k/1k/disagg/stp/mid-curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/low_latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/max_tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k/1k/disagg/stp/mid_curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/low_latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/max_tpt.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k/1k/disagg/stp/mid_curve.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/max.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k/1k/disagg/stp/mid.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/low-latency.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/max.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k/1k/disagg/stp/mid.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k/1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k/1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/mtp/low-latency-1p9d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-dep.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/bs256-1p6d-tp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k/1k/disagg/stp/low-latency-1p9d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs16-1p3d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs4-1p7d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs64-2p3d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/mtp/bs8-1p6d-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs128-1p1d-dep.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml 
similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs16-1p3d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs4-1p7d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs64-2p3d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k/1k/disagg/stp/bs8-1p6d.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k/1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k/1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k/1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k/1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k/1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k/1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k/1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k/1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k/1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k/1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k/1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k/1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k/1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k/1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% 
rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k/1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k/1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml 
similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k/1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k/1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k/1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml From 1ca06965a86a3cddfd142ab1232207134bfa970c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 14:09:32 -0500 Subject: [PATCH 06/16] runners: pin all srt-slurm clones to 
NVIDIA/srt-slurm@52e697d5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the ishandhanani/srt-slurm@sa-submission-q1-2026 fallback in launch_gb200-nv.sh — every launcher now clones NVIDIA/srt-slurm at the pinned commit 52e697d (nginx fd-limit fix on origin/main, Apr 2026). Pinning to a SHA instead of a moving branch keeps benchmark runs reproducible across upstream churn. Rename the helper's SRT_BRANCH env var to SRT_REF for accuracy (it accepts any git ref, not just a branch). Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 10 ++++++---- runners/launch_gb200-nv.sh | 10 +--------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 92998de27..01ed0657c 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -887,22 +887,24 @@ sanitize_image_filename() { # # All inputs are env vars (set before calling); all are optional: # SRT_REPO_URL default https://github.com/NVIDIA/srt-slurm.git -# SRT_BRANCH default sa-submission-q2-2026 +# SRT_REF default pinned commit SHA on NVIDIA/srt-slurm; accepts +# any git ref (branch / tag / SHA). Pinning to a SHA keeps +# benchmark runs reproducible across srt-slurm churn. # SRT_REPO_DIR default srt-slurm (relative to current cwd) # UV_INSTALL_DIR default $HOME/.local/bin (uv's own default) # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { local repo_url="${SRT_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" - local branch="${SRT_BRANCH:-sa-submission-q2-2026}" + local ref="${SRT_REF:-52e697d595569b1055b3bb436e06408a6f078293}" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" - echo "Cloning ${repo_url}@${branch} into ${repo_dir}..." + echo "Cloning ${repo_url}@${ref} into ${repo_dir}..." rm -rf "$repo_dir" git clone "$repo_url" "$repo_dir" cd "$repo_dir" || return 1 - git checkout "$branch" + git checkout "$ref" echo "Installing uv + srtctl into venv at ${uv_venv_dir}..." export UV_INSTALL_DIR="$uv_install_dir" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index e9c3e62b8..c8c822c6f 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -139,15 +139,7 @@ fi # We only clone srt-slurm to install srtctl + pick up its sibling configs # (configs/, expert-distributions/, etc). The recipe itself is supplied as an # absolute CONFIG_FILE pointing at benchmarks/multi_node/srt-slurm-recipes/. -if [[ $FRAMEWORK == "dynamo-vllm" || ( $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ) ]]; then - SRT_REPO_URL=https://github.com/NVIDIA/srt-slurm.git - SRT_BRANCH=sa-submission-q2-2026 -else - SRT_REPO_URL=https://github.com/ishandhanani/srt-slurm.git - SRT_BRANCH=sa-submission-q1-2026 -fi -SRT_REPO_URL="$SRT_REPO_URL" SRT_BRANCH="$SRT_BRANCH" \ - clone_and_install_srtctl || exit 1 +clone_and_install_srtctl || exit 1 echo "Configs available at: $SRT_REPO_DIR/" From 0f755d22ad70b90d2b1972f752880e64c9c77262 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 14:11:24 -0500 Subject: [PATCH 07/16] runners: hardcode srt-slurm pin in benchmark_lib helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop env-var override for SRT_REPO_URL / SRT_REF — every benchmark run must use the same pinned srtctl, no ad-hoc overrides at the call site. 
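With the override gone, a call site needs nothing but the helper itself; a sketch (illustrative values; SRT_REPO_DIR, UV_INSTALL_DIR, and UV_VENV_DIR remain optional env inputs per the helper's header):

    SRT_REPO_DIR=srt-slurm UV_VENV_DIR=.venv clone_and_install_srtctl || exit 1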
Bumping the pin is now a one-line edit to benchmark_lib.sh. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 01ed0657c..ec2d5a4f1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -885,17 +885,16 @@ sanitize_image_filename() { # successfully, cwd is the cloned repo and the venv is active. Idempotent on # uv: skips re-curl if the binary is already present at $UV_INSTALL_DIR. # -# All inputs are env vars (set before calling); all are optional: -# SRT_REPO_URL default https://github.com/NVIDIA/srt-slurm.git -# SRT_REF default pinned commit SHA on NVIDIA/srt-slurm; accepts -# any git ref (branch / tag / SHA). Pinning to a SHA keeps -# benchmark runs reproducible across srt-slurm churn. +# The srt-slurm commit is pinned (not env-var overridable) so every benchmark +# run uses the exact same srtctl. To bump it, edit the `ref=` line below. +# +# All other inputs are env vars (set before calling); all are optional: # SRT_REPO_DIR default srt-slurm (relative to current cwd) # UV_INSTALL_DIR default $HOME/.local/bin (uv's own default) # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { - local repo_url="${SRT_REPO_URL:-https://github.com/NVIDIA/srt-slurm.git}" - local ref="${SRT_REF:-52e697d595569b1055b3bb436e06408a6f078293}" + local repo_url="https://github.com/NVIDIA/srt-slurm.git" + local ref="52e697d595569b1055b3bb436e06408a6f078293" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" From 6f99d485e39f5b9d6405e78cb5a9940237ea3459 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 14:35:04 -0500 Subject: [PATCH 08/16] srt-slurm: wire custom-script bench, drop sa-bench dependency (proof-of-life) Stop relying on srt-slurm's bundled `benchmark.type: sa-bench` (which ships its own copy of bench_serving.py inside the upstream repo) and instead use `benchmark.type: custom` to run *this* repo's utils/bench_serving against the already-ready frontend. Avoids dual-maintaining the bench client. Plumbing: - benchmarks/multi_node/srt_bench.sh: thin wrapper that mirrors sa-bench's per-conc warmup-then-bench loop, writes results to the same /logs/sa-bench_isl_${ISL}_osl_${OSL}/results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json layout the launcher result-harvesters already grep, with conc list parsed from x-separated env (e.g. "128x256x1024"). - Recipe shape: add `container_mounts: { $INFMAX_WORKSPACE: /infmax-workspace }` + replace `benchmark: { type: sa-bench, ... }` with `benchmark: { type: custom, command: "bash /infmax-workspace/...", env: {...} }`. Migrated as proof-of-life: - dsr1/trtllm/b200-fp4/1k1k mtp ctx1_gen2_dep8_batch64_eplb0_mtp2 (TRT-LLM) - dsr1/sglang/gb200-fp4/1k1k stp low-latency (SGLang) - dsv4/vllm/gb200-fp4/1k1k stp disagg-gb200-1p1d-dep8-tep8 (vLLM) Remaining ~360 recipes still use sa-bench; they migrate in a follow-up once this triplet runs end-to-end on a real cluster. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/CONFIGS.md | 28 ++++ .../1k1k/disagg/stp/low-latency.yaml | 25 ++- .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 25 ++- .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 27 +++- benchmarks/multi_node/srt_bench.sh | 152 ++++++++++++++++++ 5 files changed, 241 insertions(+), 16 deletions(-) create mode 100755 benchmarks/multi_node/srt_bench.sh diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md index 46755ef31..eb6841b0c 100644 --- a/.github/configs/CONFIGS.md +++ b/.github/configs/CONFIGS.md @@ -73,6 +73,34 @@ search-space: - `recipe` is optional: multi-node entries that do *not* go through srt-slurm (e.g. dynamo-sglang aggregated topologies that drive their own bash) leave it unset. - Recipes live under `benchmarks/multi_node/srt-slurm-recipes/` organized as `<model>/<framework>/<gpu>-<precision>/<seqlens>/<mode>/<decoding>/<recipe>.yaml` — e.g. `dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml`. A handful of sglang-style files that carry override sections spanning both stp and mtp are parked one level shallower (the trailing `<decoding>/` segment is omitted). The benchmark template resolves `recipe` to an absolute path and passes it to the launcher as `CONFIG_FILE`, so launchers do not see the relative form. +### Custom-script benchmarking + +Recipes are migrating from srt-slurm's bundled `benchmark.type: sa-bench` to `benchmark.type: custom` so the benchmark client lives in this repo (`utils/bench_serving/benchmark_serving.py`) instead of being maintained twice. New shape: + +```yaml +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + +benchmark: + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-r1-fp4" # served-model-name advertised by the engine + ISL: "1024" + OSL: "1024" + CONCURRENCIES: "128x256x1024" # x-separated, looped inside srt_bench.sh + REQ_RATE: "inf" + IS_DISAGGREGATED: "true" + PREFILL_GPUS: "4" # per prefill worker + DECODE_GPUS: "8" # per decode worker + TOTAL_GPUS: "20" # sum across all workers + USE_CHAT_TEMPLATE: "false" # optional, defaults to true +``` + +`benchmarks/multi_node/srt_bench.sh` is a thin wrapper around `utils/bench_serving/benchmark_serving.py` that mirrors sa-bench's per-conc warmup-then-bench loop and writes results to `/logs/sa-bench_isl_${ISL}_osl_${OSL}/results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json` so the existing launcher result-harvester picks them up unchanged. See the script's header for the full env-var contract. + +The `container_mounts` block bind-mounts the host-side `$INFMAX_WORKSPACE` (set by the launcher to `$GITHUB_WORKSPACE`) at `/infmax-workspace` inside srt-slurm's benchmark container, so the wrapper and bench client are reachable at known paths. `srtctl` resolves `$INFMAX_WORKSPACE` via `os.path.expandvars` at submission time. + ## Runners The `runners.yaml` config represents the available runners in the repository. The keys are the runner *types* (i.e., the GPUs as well as some specific combinations like `b200-trt`) whereas the value is a list of *runner nodes*. This config is used to verify the master configs. diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml index 8729aa6fd..2f5deea27 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml @@ -108,9 +108,24 @@ backend: tensor-parallel-size: 4 expert-parallel-size: 1 +# InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. +# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh +# inside the benchmark container; the host-side workspace is bind-mounted via +# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for +# the full env-var contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x32" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + ISL: "1024" + OSL: "1024" + CONCURRENCIES: "4x8x32" + REQ_RATE: "inf" + IS_DISAGGREGATED: "true" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "12" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml index d4d9de835..3ca5ffd12 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml @@ -104,12 +104,27 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. +# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh +# inside the benchmark container; the host-side workspace is bind-mounted via +# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for +# the full env-var contract.
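+# Topology sketch for this recipe, reading its name (ctx1_gen2_dep8) against the
+# env below (an interpretation, not taken from upstream docs): 1 ctx worker on
+# 4 GPUs plus 2 gen workers on 8 GPUs each, so TOTAL_GPUS = 4 + 2*8 = 20.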
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1214" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-r1-fp4" + ISL: "1024" + OSL: "1024" + CONCURRENCIES: "1214" + REQ_RATE: "inf" + IS_DISAGGREGATED: "true" + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "20" frontend: nginx_container: "nginx-sqsh" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml index 984c79526..77da875f6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml @@ -135,10 +135,25 @@ backend: enable-sleep-mode: true tokenizer-mode: deepseek_v4 +# InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. +# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh +# inside the benchmark container; the host-side workspace is bind-mounted via +# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for +# the full env-var contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64" - req_rate: "inf" - use_chat_template: false + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-V4-Pro" + ISL: "1024" + OSL: "1024" + CONCURRENCIES: "1x4x8x16x32x64" + REQ_RATE: "inf" + USE_CHAT_TEMPLATE: "false" + IS_DISAGGREGATED: "true" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt_bench.sh b/benchmarks/multi_node/srt_bench.sh new file mode 100755 index 000000000..418895c0f --- /dev/null +++ b/benchmarks/multi_node/srt_bench.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash +# Drop-in replacement for srt-slurm's bundled `sa-bench` benchmark, wired to +# this repo's utils/bench_serving/benchmark_serving.py via srt-slurm's +# `benchmark.type: custom` feature. srt-slurm owns server bring-up; this +# script runs against the already-ready frontend on the head node, then +# writes one results JSON per concurrency to a path the launcher's +# result-harvester recognizes. +# +# Required env (set via `benchmark.env` in the recipe yaml): +# ISL OSL CONCURRENCIES MODEL_NAME +# IS_DISAGGREGATED TOTAL_GPUS (PREFILL_GPUS/DECODE_GPUS optional, default 0) +# +# Optional env (defaults shown): +# PORT=8000 frontend port reachable at localhost +# REQ_RATE=inf +# RANDOM_RANGE_RATIO=0.8 +# NUM_PROMPTS_MULT=10 prompts per conc = NUM_PROMPTS_MULT * conc +# NUM_WARMUP_MULT=2 warmup prompts per conc = NUM_WARMUP_MULT * conc +# USE_CHAT_TEMPLATE=true +# CUSTOM_TOKENIZER= (empty: skip --custom-tokenizer) +# DATASET_NAME=random +# DATASET_PATH= (only used when DATASET_NAME != random) +# TOKENIZER_PATH=$MODEL_NAME (override with a local or container path) +# PORT_HEALTH_PATH=/v1/models +# +# The InferenceX repo is bind-mounted into the container at /infmax-workspace +# (configured by the recipe's `container_mounts` block).
This script lives at +# /infmax-workspace/benchmarks/multi_node/srt_bench.sh and shells out to +# /infmax-workspace/utils/bench_serving/benchmark_serving.py. +set -euo pipefail + +INFMAX_WS="${INFMAX_CONTAINER_WORKSPACE:-/infmax-workspace}" + +require() { + for v in "$@"; do + if [[ -z "${!v:-}" ]]; then + echo "ERROR: required env var '$v' is unset" >&2 + exit 64 + fi + done +} +require ISL OSL CONCURRENCIES MODEL_NAME IS_DISAGGREGATED TOTAL_GPUS + +PORT="${PORT:-8000}" +REQ_RATE="${REQ_RATE:-inf}" +RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO:-0.8}" +NUM_PROMPTS_MULT="${NUM_PROMPTS_MULT:-10}" +NUM_WARMUP_MULT="${NUM_WARMUP_MULT:-2}" +USE_CHAT_TEMPLATE="${USE_CHAT_TEMPLATE:-true}" +CUSTOM_TOKENIZER="${CUSTOM_TOKENIZER:-}" +DATASET_NAME="${DATASET_NAME:-random}" +DATASET_PATH="${DATASET_PATH:-}" +PREFILL_GPUS="${PREFILL_GPUS:-0}" +DECODE_GPUS="${DECODE_GPUS:-0}" + +ENDPOINT="http://localhost:${PORT}" +RESULT_DIR="/logs/sa-bench_isl_${ISL}_osl_${OSL}" +mkdir -p "$RESULT_DIR" + +BENCH_PY="${INFMAX_WS}/utils/bench_serving/benchmark_serving.py" +[[ -f "$BENCH_PY" ]] || { echo "ERROR: benchmark_serving.py not found at $BENCH_PY (mount $INFMAX_WS missing?)" >&2; exit 65; } + +# Bench-serving deps. The srt-slurm worker container ships most of these but +# not all (datasets in particular). Reuse system-site-packages so we don't +# rebuild what's already there. +ensure_deps() { + local deps=(aiohttp numpy pandas datasets Pillow tqdm transformers huggingface_hub) + if python3 -c "import aiohttp, numpy, pandas, datasets, PIL, tqdm, transformers, huggingface_hub" 2>/dev/null; then + return + fi + local venv="/tmp/srt-bench-venv" + [[ -d "$venv" ]] || python3 -m venv --system-site-packages "$venv" + # shellcheck disable=SC1091 + source "$venv/bin/activate" + pip install --quiet "${deps[@]}" +} +ensure_deps + +# Verify endpoint (health path is the documented PORT_HEALTH_PATH knob) +echo "Verifying endpoint at $ENDPOINT ..." +curl -fsS "${ENDPOINT}${PORT_HEALTH_PATH:-/v1/models}" >/dev/null || { + echo "ERROR: endpoint $ENDPOINT did not respond on ${PORT_HEALTH_PATH:-/v1/models}" >&2 + exit 66 +} + +ulimit -n 65536 2>/dev/null || true + +DATASET_ARGS=(--dataset-name "$DATASET_NAME") +[[ -n "$DATASET_PATH" ]] && DATASET_ARGS+=(--dataset-path "$DATASET_PATH") + +RANDOM_LEN_ARGS=() +if [[ "$DATASET_NAME" == "random" ]]; then + RANDOM_LEN_ARGS=( + --random-input-len "$ISL" + --random-output-len "$OSL" + --random-range-ratio "$RANDOM_RANGE_RATIO" + ) +fi + +CHAT_TEMPLATE_ARGS=() +[[ "$USE_CHAT_TEMPLATE" == "true" ]] && CHAT_TEMPLATE_ARGS+=(--use-chat-template) + +CUSTOM_TOKENIZER_ARGS=() +[[ -n "$CUSTOM_TOKENIZER" ]] && CUSTOM_TOKENIZER_ARGS+=(--custom-tokenizer "$CUSTOM_TOKENIZER") + +# `tokenizer` is required by benchmark_serving.py; pass MODEL_NAME by default +# (HF will fetch). Recipe can override via TOKENIZER_PATH for a local path. +TOKENIZER_PATH="${TOKENIZER_PATH:-$MODEL_NAME}" + +# Concurrency list is "x"-separated for parity with sa-bench.
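+# e.g. CONCURRENCIES="4x8x32" → CONC_LIST=(4 8 32); the loop below then runs
+# one warmup pass and one measured pass per entry.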
+IFS='x' read -r -a CONC_LIST <<< "$CONCURRENCIES" + +run_bench() { + local conc=$1 + local n_prompts=$2 + local request_rate=$3 + shift 3 + python3 -u "$BENCH_PY" \ + --model "$MODEL_NAME" --tokenizer "$TOKENIZER_PATH" \ + --host localhost --port "$PORT" \ + --backend dynamo --endpoint /v1/completions \ + --disable-tqdm \ + "${DATASET_ARGS[@]}" \ + --num-prompts "$n_prompts" \ + "${RANDOM_LEN_ARGS[@]}" \ + --ignore-eos \ + --request-rate "$request_rate" \ + --percentile-metrics ttft,tpot,itl,e2el \ + --max-concurrency "$conc" \ + --trust-remote-code \ + "${CHAT_TEMPLATE_ARGS[@]}" \ + "${CUSTOM_TOKENIZER_ARGS[@]}" \ + "$@" +} + +for conc in "${CONC_LIST[@]}"; do + echo "=== conc=$conc warmup ===" + run_bench "$conc" "$((conc * NUM_WARMUP_MULT))" 250 || true + + if [[ "$IS_DISAGGREGATED" == "true" ]]; then + result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json" + else + result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}.json" + fi + + echo "=== conc=$conc bench → $RESULT_DIR/$result_filename ===" + run_bench "$conc" "$((conc * NUM_PROMPTS_MULT))" "$REQ_RATE" \ + --result-dir "$RESULT_DIR" \ + --result-filename "$result_filename" +done + +echo "Done. Results in $RESULT_DIR." From 290fcb68749358131fad4416a8eb359c44f038de Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 15:12:00 -0500 Subject: [PATCH 09/16] =?UTF-8?q?srt-slurm:=20simplify=20custom-bench=20pl?= =?UTF-8?q?umbing=20=E2=80=94=20drop=20redundant=20recipe=20env?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two pieces, one commit: 1. benchmark_lib.sh's run_benchmark_serving() gains optional pass-throughs for --tokenizer / --endpoint / --dataset-name / --dataset-path so the multi-node srt_bench.sh wrapper can reuse it instead of forking its own command-build. (--request-rate stays hardcoded "inf" — no recipe-level override.) ~50 lines of duplicated shell deleted from srt_bench.sh. 2. Recipe `benchmark.env` blocks lose every variable that is already exported by .github/workflows/benchmark-multinode-tmpl.yml at the workflow step level: MODEL, ISL, OSL, CONC_LIST, DISAGG, RANDOM_RANGE_RATIO. Those propagate down through srtctl → srun (default --export=ALL) → pyxis into the bench container, so srt_bench.sh reads them directly. Recipes now only carry per-recipe topology knobs (PREFILL_GPUS / DECODE_GPUS / TOTAL_GPUS — used in the result filename) plus the rare overrides. Tokenizer is hardcoded to /model — srtctl's RuntimeContext.create unconditionally bind-mounts the local model dir at that path in every container, so AutoTokenizer.from_pretrained("/model") loads from the same files the engine is serving. No HF Hub egress, works for HF-id and alias-only `model:` values alike, no `TOKENIZER_PATH` knob in recipes.
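The chain, sketched with illustrative values (the srtctl invocation itself is elided; srun's default --export=ALL is the step that carries the workflow env into the container):

    # workflow step (benchmark-multinode-tmpl.yml) exports, e.g.:
    export MODEL=deepseek-r1-fp4 ISL=1024 OSL=1024 CONC_LIST=4x8x32 DISAGG=true
    # launcher -> srtctl -> srun --export=ALL -> pyxis container, where:
    bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh  # reads them from the env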
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/CONFIGS.md | 20 +- benchmarks/benchmark_lib.sh | 37 +++- .../1k1k/disagg/stp/low-latency.yaml | 15 +- .../ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml | 20 +- .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 14 +- benchmarks/multi_node/srt_bench.sh | 192 ++++++++---------- 6 files changed, 146 insertions(+), 152 deletions(-)
diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md index eb6841b0c..302605fbb 100644 --- a/.github/configs/CONFIGS.md +++ b/.github/configs/CONFIGS.md @@ -85,19 +85,17 @@ benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: - MODEL_NAME: "deepseek-r1-fp4" # served-model-name advertised by the engine - ISL: "1024" - OSL: "1024" - CONCURRENCIES: "128x256x1024" # x-separated, looped inside srt_bench.sh - REQ_RATE: "inf" - IS_DISAGGREGATED: "true" - PREFILL_GPUS: "4" # per prefill worker - DECODE_GPUS: "8" # per decode worker - TOTAL_GPUS: "20" # sum across all workers - USE_CHAT_TEMPLATE: "false" # optional, defaults to true + PREFILL_GPUS: "4" # per prefill worker (filename component) + DECODE_GPUS: "8" # per decode worker (filename component) + TOTAL_GPUS: "20" # sum across workers (filename component) + # MODEL_NAME: "..." # only when server's served-model-name + # differs from master-yaml's `model:` + # USE_CHAT_TEMPLATE: "false" # only when overriding default (true) ```
-`benchmarks/multi_node/srt_bench.sh` is a thin wrapper around `utils/bench_serving/benchmark_serving.py` that mirrors sa-bench's per-conc warmup-then-bench loop and writes results to `/logs/sa-bench_isl_${ISL}_osl_${OSL}/results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json` so the existing launcher result-harvester picks them up unchanged. See the script's header for the full env-var contract.
+`MODEL`, `ISL`, `OSL`, `CONC_LIST`, `DISAGG`, `RANDOM_RANGE_RATIO` are exported by `benchmark-multinode-tmpl.yml` at the workflow step and propagate through the launcher → `srtctl` → `srun` (default `--export=ALL`) → pyxis into the benchmark container, so they don't need to be re-declared in `benchmark.env`. The recipe only carries per-recipe topology knobs (`PREFILL_GPUS`/`DECODE_GPUS`/`TOTAL_GPUS`, used in the result filename) plus the rare overrides (`MODEL_NAME` when the server's served-model-name diverges from `model:`, `USE_CHAT_TEMPLATE: false` for tokenizers that have no chat template, etc.).
+
+`benchmarks/multi_node/srt_bench.sh` is a thin wrapper around `run_benchmark_serving()` in `benchmarks/benchmark_lib.sh` (the same shim every single-node bench script uses). It loops once per concurrency in `$CONC_LIST` and writes results to `/logs/sa-bench_isl_${ISL}_osl_${OSL}/results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json` so existing launcher result-harvesters pick them up unchanged. Tokenizer is loaded from `/model` — `srtctl`'s `RuntimeContext.create` auto-mounts the model dir at that path in every container, so we don't need any HF Hub egress. The `container_mounts` block bind-mounts the host-side `$INFMAX_WORKSPACE` (set by the launcher to `$GITHUB_WORKSPACE`) at `/infmax-workspace` inside srt-slurm's benchmark container, so the wrapper and bench client are reachable at known paths. `srtctl` resolves `$INFMAX_WORKSPACE` via `os.path.expandvars` at submission time.
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ec2d5a4f1..e42926dde 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -206,6 +206,12 @@ run_benchmark_serving() { local dsv4=false local trust_remote_code=false local server_pid="" + # Optional knobs surfaced for the multi-node srt_bench.sh wrapper so it + # can use this same command-build instead of forking its own. + local endpoint="" + local dataset_name="random" + local dataset_path="" + local tokenizer="" while [[ $# -gt 0 ]]; do case $1 in @@ -270,6 +276,22 @@ run_benchmark_serving() { server_pid="$2" shift 2 ;; + --endpoint) + endpoint="$2" + shift 2 + ;; + --dataset-name) + dataset_name="$2" + shift 2 + ;; + --dataset-path) + dataset_path="$2" + shift 2 + ;; + --tokenizer) + tokenizer="$2" + shift 2 + ;; *) echo "Unknown parameter: $1" return 1 @@ -341,7 +363,7 @@ run_benchmark_serving() { --model "$model" --backend "$backend" --base-url "http://0.0.0.0:$port" - --dataset-name random + --dataset-name "$dataset_name" --random-input-len "$input_len" --random-output-len "$output_len" --random-range-ratio "$random_range_ratio" @@ -356,7 +378,18 @@ run_benchmark_serving() { --result-dir "$result_dir" --result-filename "$result_filename.json" ) - + + # Optional pass-throughs. + if [[ -n "$endpoint" ]]; then + benchmark_cmd+=(--endpoint "$endpoint") + fi + if [[ -n "$dataset_path" ]]; then + benchmark_cmd+=(--dataset-path "$dataset_path") + fi + if [[ -n "$tokenizer" ]]; then + benchmark_cmd+=(--tokenizer "$tokenizer") + fi + # Add --use-chat-template if requested if [[ "$use_chat_template" == true ]]; then benchmark_cmd+=(--use-chat-template)
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml index 2f5deea27..b280e7176 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/low-latency.yaml @@ -109,10 +109,10 @@ backend: expert-parallel-size: 1 # InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. -# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh -# inside the benchmark container; the host-side workspace is bind-mounted via -# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for -# the full env-var contract. +# Most env (MODEL, ISL, OSL, CONC_LIST, DISAGG) is exported by +# benchmark-multinode-tmpl.yml and propagated through srtctl → srun → pyxis, +# so the recipe only carries per-recipe knobs that have no workflow source. +# See benchmarks/multi_node/srt_bench.sh for the full env contract.
container_mounts: "$INFMAX_WORKSPACE": "/infmax-workspace" @@ -120,12 +120,9 @@ benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: + # Override $MODEL because this sglang recipe advertises a different + # served-model-name from what master-yaml's `model:` field is set to. MODEL_NAME: "deepseek-ai/DeepSeek-R1" - ISL: "1024" - OSL: "1024" - CONCURRENCIES: "4x8x32" - REQ_RATE: "inf" - IS_DISAGGREGATED: "true" PREFILL_GPUS: "4" DECODE_GPUS: "4" TOTAL_GPUS: "12" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml index 3ca5ffd12..7e59b1617 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml @@ -105,10 +105,10 @@ backend: num_nextn_predict_layers: 2 # InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. -# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh -# inside the benchmark container; the host-side workspace is bind-mounted via -# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for -# the full env-var contract. +# Most env (MODEL, ISL, OSL, CONC_LIST, DISAGG) is exported by +# benchmark-multinode-tmpl.yml and propagated through srtctl → srun → pyxis, +# so the recipe only carries per-recipe knobs that have no workflow source. +# See benchmarks/multi_node/srt_bench.sh for the full env contract. container_mounts: "$INFMAX_WORKSPACE": "/infmax-workspace" @@ -116,15 +116,9 @@ benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: - MODEL_NAME: "deepseek-r1-fp4" - ISL: "1024" - OSL: "1024" - CONCURRENCIES: "1214" - REQ_RATE: "inf" - IS_DISAGGREGATED: "true" - PREFILL_GPUS: "4" - DECODE_GPUS: "8" - TOTAL_GPUS: "20" + PREFILL_GPUS: "4" # per prefill worker + DECODE_GPUS: "8" # per decode worker + TOTAL_GPUS: "20" # sum across all workers frontend: nginx_container: "nginx-sqsh" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml index 77da875f6..15790d70f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml @@ -136,10 +136,10 @@ backend: tokenizer-mode: deepseek_v4 # InferenceX bench-serving wrapper, invoked via srt-slurm `benchmark.type: custom`. -# srt_bench.sh lives at /infmax-workspace/benchmarks/multi_node/srt_bench.sh -# inside the benchmark container; the host-side workspace is bind-mounted via -# the container_mounts block below. See benchmarks/multi_node/srt_bench.sh for -# the full env-var contract. +# Most env (MODEL, ISL, OSL, CONC_LIST, DISAGG) is exported by +# benchmark-multinode-tmpl.yml and propagated through srtctl → srun → pyxis, +# so the recipe only carries per-recipe knobs that have no workflow source. +# See benchmarks/multi_node/srt_bench.sh for the full env contract. 
container_mounts: "$INFMAX_WORKSPACE": "/infmax-workspace" @@ -147,13 +147,7 @@ benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: - MODEL_NAME: "deepseek-ai/DeepSeek-V4-Pro" - ISL: "1024" - OSL: "1024" - CONCURRENCIES: "1x4x8x16x32x64" - REQ_RATE: "inf" USE_CHAT_TEMPLATE: "false" - IS_DISAGGREGATED: "true" PREFILL_GPUS: "8" DECODE_GPUS: "8" TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt_bench.sh b/benchmarks/multi_node/srt_bench.sh index 418895c0f..9e82a08cb 100755 --- a/benchmarks/multi_node/srt_bench.sh +++ b/benchmarks/multi_node/srt_bench.sh @@ -1,69 +1,79 @@ #!/usr/bin/env bash -# Drop-in replacement for srt-slurm's bundled `sa-bench` benchmark, wired to -# this repo's utils/bench_serving/benchmark_serving.py via srt-slurm's -# `benchmark.type: custom` feature. srt-slurm owns server bring-up; this -# script runs against the already-ready frontend on the head node, then -# writes one results JSON per concurrency to a path the launcher's -# result-harvester recognizes. +# Multi-node bench-serving wrapper invoked by srt-slurm via +# `benchmark.type: custom`. srt-slurm owns server bring-up; this script runs +# inside the same job's benchmark container against the already-ready +# frontend on the head node, then writes one results JSON per concurrency to +# /logs/sa-bench_isl__osl_/ — the same path the launcher's existing +# result-harvesters glob. # -# Required env (set via `benchmark.env` in the recipe yaml): -# ISL OSL CONCURRENCIES MODEL_NAME -# IS_DISAGGREGATED TOTAL_GPUS PREFILL_GPUS DECODE_GPUS +# This is a thin loop on top of run_benchmark_serving() in benchmark_lib.sh +# (the same shim every single-node bench script uses), so any future change +# to bench-serving CLI conventions, profiling, server-health monitoring, etc. +# applies here automatically. # -# Optional env (defaults shown): +# Reads from env. 
Most of these are *already* exported by +# .github/workflows/benchmark-multinode-tmpl.yml at the workflow step level +# and propagate down through the launcher → srtctl → srun (default +# --export=ALL) → pyxis → bench container, so recipes do not need to +# re-declare them in `benchmark.env`: +# +# $MODEL served-model-name; matches workflow `inputs.model` +# $ISL $OSL sequence lengths +# $CONC_LIST space-separated concurrency list +# $DISAGG "true" / "false" — disagg vs aggregated +# $RANDOM_RANGE_RATIO 0.8 (workflow default) +# +# Per-recipe knobs that *do* live in `benchmark.env` (no workflow equivalent): +# PREFILL_GPUS per-prefill-worker GPU count (filename component) +# DECODE_GPUS per-decode-worker GPU count (filename component) +# TOTAL_GPUS sum across all workers (filename component) +# +# Optional per-recipe overrides (defaults shown): +# MODEL_NAME=$MODEL override when server's served-model-name differs +# from the master-yaml `model:` field # PORT=8000 frontend port reachable at localhost -# REQ_RATE=inf -# RANDOM_RANGE_RATIO=0.8 +# BACKEND=dynamo +# ENDPOINT=/v1/completions # NUM_PROMPTS_MULT=10 prompts per conc = NUM_PROMPTS_MULT * conc -# NUM_WARMUP_MULT=2 warmup prompts per conc = NUM_WARMUP_MULT * conc # USE_CHAT_TEMPLATE=true -# CUSTOM_TOKENIZER= (empty: skip --custom-tokenizer) +# DSV4=false sets the --dsv4 flag (auto-enables chat template) +# TRUST_REMOTE_CODE=true # DATASET_NAME=random -# DATASET_PATH= (only used when DATASET_NAME != random) -# TOKENIZER_PATH=$MODEL_PATH (or container path; falls back to $MODEL_NAME) -# PORT_HEALTH_PATH=/v1/models +# DATASET_PATH= (only meaningful when DATASET_NAME != random) # -# The InferenceX repo is bind-mounted into the container at /infmax-workspace -# (configured by the recipe's `container_mounts` block). This script lives at -# /infmax-workspace/benchmarks/multi_node/srt_bench.sh and shells out to -# /infmax-workspace/utils/bench_serving/benchmark_serving.py. +# The InferenceX repo is bind-mounted at /infmax-workspace via each recipe's +# `container_mounts` block. Model files are auto-mounted at /model by srtctl +# (RuntimeContext.create unconditionally adds the mount when model.path is a +# local path), so we point --tokenizer at /model to load the tokenizer from +# the same files the engine is serving — no HF Hub dependency. 
set -euo pipefail INFMAX_WS="${INFMAX_CONTAINER_WORKSPACE:-/infmax-workspace}" +# shellcheck disable=SC1091 +source "$INFMAX_WS/benchmarks/benchmark_lib.sh" -require() { - for v in "$@"; do - if [[ -z "${!v:-}" ]]; then - echo "ERROR: required env var '$v' is unset" >&2 - exit 64 - fi - done -} -require ISL OSL CONCURRENCIES MODEL_NAME IS_DISAGGREGATED TOTAL_GPUS +check_env_vars MODEL ISL OSL CONC_LIST DISAGG \ + PREFILL_GPUS DECODE_GPUS TOTAL_GPUS +MODEL_NAME="${MODEL_NAME:-$MODEL}" PORT="${PORT:-8000}" -REQ_RATE="${REQ_RATE:-inf}" +BACKEND="${BACKEND:-dynamo}" +ENDPOINT="${ENDPOINT:-/v1/completions}" RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO:-0.8}" NUM_PROMPTS_MULT="${NUM_PROMPTS_MULT:-10}" -NUM_WARMUP_MULT="${NUM_WARMUP_MULT:-2}" USE_CHAT_TEMPLATE="${USE_CHAT_TEMPLATE:-true}" -CUSTOM_TOKENIZER="${CUSTOM_TOKENIZER:-}" +DSV4="${DSV4:-false}" +TRUST_REMOTE_CODE="${TRUST_REMOTE_CODE:-true}" DATASET_NAME="${DATASET_NAME:-random}" DATASET_PATH="${DATASET_PATH:-}" -PREFILL_GPUS="${PREFILL_GPUS:-0}" -DECODE_GPUS="${DECODE_GPUS:-0}" -ENDPOINT="http://localhost:${PORT}" RESULT_DIR="/logs/sa-bench_isl_${ISL}_osl_${OSL}" mkdir -p "$RESULT_DIR" -BENCH_PY="${INFMAX_WS}/utils/bench_serving/benchmark_serving.py" -[[ -f "$BENCH_PY" ]] || { echo "ERROR: benchmark_serving.py not found at $BENCH_PY (mount $INFMAX_WS missing?)" >&2; exit 65; } - -# Bench-serving deps. The srt-slurm worker container ships most of these but -# not all (datasets in particular). Reuse system-site-packages so we don't -# rebuild what's already there. -ensure_deps() { +# srt-slurm worker containers don't always ship bench_serving.py's runtime +# deps (datasets in particular). Install missing ones into a system-site- +# packages venv so we don't perturb the framework's own packages. +ensure_bench_serving_deps() { local deps=(aiohttp numpy pandas datasets Pillow tqdm transformers huggingface_hub) if python3 -c "import aiohttp, numpy, pandas, datasets, PIL, tqdm, transformers, huggingface_hub" 2>/dev/null; then return @@ -74,79 +84,47 @@ ensure_deps() { source "$venv/bin/activate" pip install --quiet "${deps[@]}" } -ensure_deps +ensure_bench_serving_deps -# Verify endpoint -echo "Verifying endpoint at $ENDPOINT ..." -curl -fsS "${ENDPOINT}/v1/models" >/dev/null || { - echo "ERROR: endpoint $ENDPOINT did not respond on /v1/models" >&2 +curl -fsS "http://localhost:${PORT}/v1/models" >/dev/null || { + echo "ERROR: frontend at http://localhost:${PORT} did not respond on /v1/models" >&2 exit 66 } - ulimit -n 65536 2>/dev/null || true -DATASET_ARGS=(--dataset-name "$DATASET_NAME") -[[ -n "$DATASET_PATH" ]] && DATASET_ARGS+=(--dataset-path "$DATASET_PATH") - -RANDOM_LEN_ARGS=() -if [[ "$DATASET_NAME" == "random" ]]; then - RANDOM_LEN_ARGS=( - --random-input-len "$ISL" - --random-output-len "$OSL" - --random-range-ratio "$RANDOM_RANGE_RATIO" - ) -fi - -CHAT_TEMPLATE_ARGS=() -[[ "$USE_CHAT_TEMPLATE" == "true" ]] && CHAT_TEMPLATE_ARGS+=(--use-chat-template) +# CONC_LIST from the workflow is space-separated; bench loops one run per value. +read -r -a CONC_LIST_ARR <<< "$CONC_LIST" -CUSTOM_TOKENIZER_ARGS=() -[[ -n "$CUSTOM_TOKENIZER" ]] && CUSTOM_TOKENIZER_ARGS+=(--custom-tokenizer "$CUSTOM_TOKENIZER") - -# `tokenizer` is required by benchmark_serving.py; pass MODEL_NAME by default -# (HF will fetch). Recipe can override via TOKENIZER_PATH for a local path. -TOKENIZER_PATH="${TOKENIZER_PATH:-$MODEL_NAME}" - -# Concurrency list is "x"-separated for parity with sa-bench. 
-IFS='x' read -r -a CONC_LIST <<< "$CONCURRENCIES" - -run_bench() { - local conc=$1 - local n_prompts=$2 - local request_rate=$3 - shift 3 - python3 -u "$BENCH_PY" \ - --model "$MODEL_NAME" --tokenizer "$TOKENIZER_PATH" \ - --host localhost --port "$PORT" \ - --backend dynamo --endpoint /v1/completions \ - --disable-tqdm \ - "${DATASET_ARGS[@]}" \ - --num-prompts "$n_prompts" \ - "${RANDOM_LEN_ARGS[@]}" \ - --ignore-eos \ - --request-rate "$request_rate" \ - --percentile-metrics ttft,tpot,itl,e2el \ - --max-concurrency "$conc" \ - --trust-remote-code \ - "${CHAT_TEMPLATE_ARGS[@]}" \ - "${CUSTOM_TOKENIZER_ARGS[@]}" \ - "$@" -} - -for conc in "${CONC_LIST[@]}"; do - echo "=== conc=$conc warmup ===" - run_bench "$conc" "$((conc * NUM_WARMUP_MULT))" 250 || true - - if [[ "$IS_DISAGGREGATED" == "true" ]]; then - result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}.json" +for conc in "${CONC_LIST_ARR[@]}"; do + if [[ "$DISAGG" == "true" ]]; then + result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}_ctx_${PREFILL_GPUS}_gen_${DECODE_GPUS}" else - result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}.json" + result_filename="results_concurrency_${conc}_gpus_${TOTAL_GPUS}" fi - - echo "=== conc=$conc bench → $RESULT_DIR/$result_filename ===" - run_bench "$conc" "$((conc * NUM_PROMPTS_MULT))" "$REQ_RATE" \ - --result-dir "$RESULT_DIR" \ + echo "=== conc=$conc → $RESULT_DIR/${result_filename}.json ===" + + args=( + --model "$MODEL_NAME" + --tokenizer /model + --port "$PORT" + --backend "$BACKEND" + --endpoint "$ENDPOINT" + --input-len "$ISL" + --output-len "$OSL" + --random-range-ratio "$RANDOM_RANGE_RATIO" + --num-prompts "$((conc * NUM_PROMPTS_MULT))" + --max-concurrency "$conc" + --dataset-name "$DATASET_NAME" --result-filename "$result_filename" + --result-dir "$RESULT_DIR" + --bench-serving-dir "$INFMAX_WS" + ) + [[ -n "$DATASET_PATH" ]] && args+=(--dataset-path "$DATASET_PATH") + [[ "$USE_CHAT_TEMPLATE" == "true" ]] && args+=(--use-chat-template) + [[ "$DSV4" == "true" ]] && args+=(--dsv4) + [[ "$TRUST_REMOTE_CODE" == "true" ]] && args+=(--trust-remote-code) + + run_benchmark_serving "${args[@]}" done echo "Done. Results in $RESULT_DIR." From adf8a11e095cbb97c25846c4456d6fefc9b339ea Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 15:33:52 -0500 Subject: [PATCH 10/16] srt-slurm: keep run_benchmark_serving pass-throughs to just --tokenizer/--endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Walked back the --dataset-name / --dataset-path additions to run_benchmark_serving — both default cleanly (random / unset) for every multi-node throughput sweep we run, so the pass-throughs were dead weight. srt_bench.sh stops setting DATASET_NAME / DATASET_PATH from env. Kept --tokenizer (srt_bench points it at /model since --model is the served-model-name alias, not a HF id) and --endpoint (recipes may need /v1/chat/completions for chat-template-only request paths). 
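Sketch of the surviving surface as srt_bench.sh drives it (other required
flags elided; the chat endpoint is a hypothetical per-recipe override, not
a default):

    args=(
      --model "$MODEL_NAME"   # served-model-name alias, not always a HF id
      --tokenizer /model      # srtctl's auto-mount of the local model dir
    )
    # only recipes that need the chat-template request path set ENDPOINT
    [[ -n "${ENDPOINT:-}" ]] && args+=(--endpoint "$ENDPOINT")
    run_benchmark_serving "${args[@]}"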
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 38 +++++++++++------------------- benchmarks/multi_node/srt_bench.sh | 6 ----- 2 files changed, 14 insertions(+), 30 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e42926dde..ad53360fa 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -206,12 +206,13 @@ run_benchmark_serving() { local dsv4=false local trust_remote_code=false local server_pid="" - # Optional knobs surfaced for the multi-node srt_bench.sh wrapper so it - # can use this same command-build instead of forking its own. - local endpoint="" - local dataset_name="random" - local dataset_path="" + # Optional --tokenizer / --endpoint pass-throughs for the multi-node + # srt_bench.sh. --tokenizer points the bench at the /model auto-mount + # (avoids relying on --model being a HF-resolvable id). --endpoint lets + # recipes target /v1/chat/completions when chat-template-only request + # paths are required. local tokenizer="" + local endpoint="" while [[ $# -gt 0 ]]; do case $1 in @@ -276,22 +277,14 @@ run_benchmark_serving() { server_pid="$2" shift 2 ;; - --endpoint) - endpoint="$2" - shift 2 - ;; - --dataset-name) - dataset_name="$2" - shift 2 - ;; - --dataset-path) - dataset_path="$2" - shift 2 - ;; --tokenizer) tokenizer="$2" shift 2 ;; + --endpoint) + endpoint="$2" + shift 2 + ;; *) echo "Unknown parameter: $1" return 1 @@ -363,7 +356,7 @@ run_benchmark_serving() { --model "$model" --backend "$backend" --base-url "http://0.0.0.0:$port" - --dataset-name "$dataset_name" + --dataset-name random --random-input-len "$input_len" --random-output-len "$output_len" --random-range-ratio "$random_range_ratio" @@ -380,15 +373,12 @@ run_benchmark_serving() { ) # Optional pass-throughs. - if [[ -n "$endpoint" ]]; then - benchmark_cmd+=(--endpoint "$endpoint") - fi - if [[ -n "$dataset_path" ]]; then - benchmark_cmd+=(--dataset-path "$dataset_path") - fi if [[ -n "$tokenizer" ]]; then benchmark_cmd+=(--tokenizer "$tokenizer") fi + if [[ -n "$endpoint" ]]; then + benchmark_cmd+=(--endpoint "$endpoint") + fi # Add --use-chat-template if requested if [[ "$use_chat_template" == true ]]; then diff --git a/benchmarks/multi_node/srt_bench.sh b/benchmarks/multi_node/srt_bench.sh index 9e82a08cb..7b165faf2 100755 --- a/benchmarks/multi_node/srt_bench.sh +++ b/benchmarks/multi_node/srt_bench.sh @@ -38,8 +38,6 @@ # USE_CHAT_TEMPLATE=true # DSV4=false sets the --dsv4 flag (auto-enables chat template) # TRUST_REMOTE_CODE=true -# DATASET_NAME=random -# DATASET_PATH= (only meaningful when DATASET_NAME != random) # # The InferenceX repo is bind-mounted at /infmax-workspace via each recipe's # `container_mounts` block. 
Model files are auto-mounted at /model by srtctl @@ -64,8 +62,6 @@ NUM_PROMPTS_MULT="${NUM_PROMPTS_MULT:-10}" USE_CHAT_TEMPLATE="${USE_CHAT_TEMPLATE:-true}" DSV4="${DSV4:-false}" TRUST_REMOTE_CODE="${TRUST_REMOTE_CODE:-true}" -DATASET_NAME="${DATASET_NAME:-random}" -DATASET_PATH="${DATASET_PATH:-}" RESULT_DIR="/logs/sa-bench_isl_${ISL}_osl_${OSL}" mkdir -p "$RESULT_DIR" @@ -114,12 +110,10 @@ for conc in "${CONC_LIST_ARR[@]}"; do --random-range-ratio "$RANDOM_RANGE_RATIO" --num-prompts "$((conc * NUM_PROMPTS_MULT))" --max-concurrency "$conc" - --dataset-name "$DATASET_NAME" --result-filename "$result_filename" --result-dir "$RESULT_DIR" --bench-serving-dir "$INFMAX_WS" ) - [[ -n "$DATASET_PATH" ]] && args+=(--dataset-path "$DATASET_PATH") [[ "$USE_CHAT_TEMPLATE" == "true" ]] && args+=(--use-chat-template) [[ "$DSV4" == "true" ]] && args+=(--dsv4) [[ "$TRUST_REMOTE_CODE" == "true" ]] && args+=(--trust-remote-code) From baf8e28ae02efe06ec05f031d9c989358ca8ba1b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 15:50:56 -0500 Subject: [PATCH 11/16] srt-slurm: compress recipe-resolution block in benchmark template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same behavior, fewer lines: collapse the two-step suffix split into a single ${RECIPE#"${RECIPE%%:*}"} parameter expansion. 12 active lines become 5. No semantic change — verified parsing for plain paths, :override, and :zip_override_[N] forms. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/benchmark-multinode-tmpl.yml | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index b6b6a30f3..a8005096b 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -179,22 +179,14 @@ jobs: echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} - # Resolve `recipe` (path relative to benchmarks/multi_node/srt-slurm-recipes/, - # optionally ending in `:override[N]`) into an absolute CONFIG_FILE for the - # launcher. Copy the recipe to a scratch path first so the launcher's - # `sed -i` rewrites (job name, health-check timeout, ...) don't mutate the - # tracked file in-place between runs. + # RECIPE = "[:override[N]]" relative to benchmarks/multi_node/srt-slurm-recipes/. + # Copy the file to scratch so the launcher's `sed -i` rewrites don't mutate the + # tracked recipe between concurrent runs; preserve any :override suffix verbatim. 
if [[ -n "$RECIPE" ]]; then - recipe_path="${RECIPE%%:*}" - recipe_suffix="" - if [[ "$RECIPE" == *:* ]]; then - recipe_suffix=":${RECIPE#*:}" - fi - src="${GITHUB_WORKSPACE}/benchmarks/multi_node/srt-slurm-recipes/${recipe_path}" - scratch_dir="$(mktemp -d)" - scratch_recipe="${scratch_dir}/$(basename "$recipe_path")" - cp "$src" "$scratch_recipe" - export CONFIG_FILE="${scratch_recipe}${recipe_suffix}" + src="${GITHUB_WORKSPACE}/benchmarks/multi_node/srt-slurm-recipes/${RECIPE%%:*}" + scratch="$(mktemp -d)/$(basename "${RECIPE%%:*}")" + cp "$src" "$scratch" + export CONFIG_FILE="${scratch}${RECIPE#"${RECIPE%%:*}"}" fi export IS_MULTINODE=true bash ./runners/launch_${RUNNER_NAME%%_*}.sh From d3e9b932e013178bfdc29b4eb92f0724fa462d0b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 16:07:46 -0500 Subject: [PATCH 12/16] runners: roll srt-slurm pin back one commit to dodge nginx ulimit regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream commit 52e697d (#108 "fix(nginx): raise file descriptor limit for nginx workers") prepends `ulimit -n 1048576 &&` to the nginx srun command. On clusters whose container inherits a sub-1M RLIMIT_NOFILE hard limit from slurmd/PAM, the bash builtin's setrlimit fails with EPERM (raising the hard rlimit needs CAP_SYS_RESOURCE in the init user namespace, which pyxis --container-remap-root does not grant). The `&&` short-circuits and nginx never starts — caught when re-running dsr1-fp4-gb200-dynamo-sglang. Pin back to 698590e ("feat(config): cluster-wide default_bash_preamble for ulimits and the like (#104)"), the immediately prior commit, where nginx runs without the chained ulimit. Bump forward once upstream softens the ulimit to `|| true` or makes it opt-in. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index ad53360fa..4394b2f32 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -917,7 +917,13 @@ sanitize_image_filename() { # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { local repo_url="https://github.com/NVIDIA/srt-slurm.git" - local ref="52e697d595569b1055b3bb436e06408a6f078293" + # 52e697d (#108 fix(nginx): raise file descriptor limit for nginx workers) + # adds an unconditional `ulimit -n 1048576 && nginx` chain that fails with + # EPERM on clusters whose container RLIMIT_NOFILE hard limit is below 1M + # (CAP_SYS_RESOURCE in a user namespace can't raise the hard rlimit past + # what was inherited from slurmd/PAM). Pin to the prior commit until + # upstream softens that to `|| true` or makes the bump opt-in. + local ref="698590e6486b1febb31f8887b240cf84241ca1db" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" From 12410868ab7550f17dc2da95b2089ba45f9deb4b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 16:18:45 -0500 Subject: [PATCH 13/16] runners: bump srt-slurm pin to ishan-rework-nginx (425b486) Replaces the temporary rollback (698590e) with the upstream fix branch. 425b486 is the tip of NVIDIA/srt-slurm's `ishan-rework-nginx`, which makes the nginx ulimit + nginx.conf `worker_rlimit_nofile` directive opt-in via a new `frontend.nginx_raise_ulimit` field (default false). 
Without us opting in, nginx runs without the EPERM-prone bump that #108 introduced. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 4394b2f32..d76a7439e 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -917,13 +917,11 @@ sanitize_image_filename() { # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { local repo_url="https://github.com/NVIDIA/srt-slurm.git" - # 52e697d (#108 fix(nginx): raise file descriptor limit for nginx workers) - # adds an unconditional `ulimit -n 1048576 && nginx` chain that fails with - # EPERM on clusters whose container RLIMIT_NOFILE hard limit is below 1M - # (CAP_SYS_RESOURCE in a user namespace can't raise the hard rlimit past - # what was inherited from slurmd/PAM). Pin to the prior commit until - # upstream softens that to `|| true` or makes the bump opt-in. - local ref="698590e6486b1febb31f8887b240cf84241ca1db" + # Pinned to ishan-rework-nginx tip — gates the nginx ulimit + worker_rlimit_nofile + # behind an opt-in `frontend.nginx_raise_ulimit` field (default false). #108's + # unconditional `ulimit -n 1048576 && nginx` chain previously crashed clusters + # whose container RLIMIT_NOFILE hard limit was below 1M. + local ref="425b486ce23c6a68ddb57009998a666c0acd0892" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" From fecd2de2b50b7a98b839b7df528b72a88550b4c6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 16:27:58 -0500 Subject: [PATCH 14/16] srt-slurm: default bench backend to `openai`, drop hardcoded /v1/completions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upstream sa-bench used `--backend dynamo --endpoint /v1/completions`, but this repo's benchmark_serving.py doesn't have a `dynamo` backend choice (it has tgi/vllm/lmdeploy/deepspeed-mii/openai/openai-chat/tensorrt-llm/ scalellm/sglang). The dynamo frontend exposes a generic OpenAI-compatible API regardless of the underlying engine, so `openai` is the right canonical default. Recipes that need /v1/chat/completions can override via ENDPOINT. Also drop the unconditional `--endpoint /v1/completions` — bench_serving.py already defaults to that, and we now only pass --endpoint when ENDPOINT is non-empty (matches single-node bench scripts that don't pass it at all). 
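Usage sketch (the second invocation is a hypothetical recipe-level
override, expressed as the shell its `benchmark.env` block reduces to):

    # default: generic OpenAI completions against the dynamo frontend
    bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh
    # chat-template-only path: override both knobs
    BACKEND=openai-chat ENDPOINT=/v1/chat/completions \
      bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh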
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/multi_node/srt_bench.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/srt_bench.sh b/benchmarks/multi_node/srt_bench.sh index 7b165faf2..aeb1ef502 100755 --- a/benchmarks/multi_node/srt_bench.sh +++ b/benchmarks/multi_node/srt_bench.sh @@ -32,8 +32,8 @@ # MODEL_NAME=$MODEL override when server's served-model-name differs # from the master-yaml `model:` field # PORT=8000 frontend port reachable at localhost -# BACKEND=dynamo -# ENDPOINT=/v1/completions +# BACKEND=openai generic OpenAI-API; works against the dynamo frontend +# ENDPOINT= empty -> bench_serving.py default (/v1/completions) # NUM_PROMPTS_MULT=10 prompts per conc = NUM_PROMPTS_MULT * conc # USE_CHAT_TEMPLATE=true # DSV4=false sets the --dsv4 flag (auto-enables chat template) @@ -55,8 +55,11 @@ check_env_vars MODEL ISL OSL CONC_LIST DISAGG \ MODEL_NAME="${MODEL_NAME:-$MODEL}" PORT="${PORT:-8000}" -BACKEND="${BACKEND:-dynamo}" -ENDPOINT="${ENDPOINT:-/v1/completions}" +# `openai` matches every dynamo frontend (frontend exposes a generic OpenAI- +# compatible API regardless of the underlying engine). Recipes that need +# /v1/chat/completions can override ENDPOINT. +BACKEND="${BACKEND:-openai}" +ENDPOINT="${ENDPOINT:-}" RANDOM_RANGE_RATIO="${RANDOM_RANGE_RATIO:-0.8}" NUM_PROMPTS_MULT="${NUM_PROMPTS_MULT:-10}" USE_CHAT_TEMPLATE="${USE_CHAT_TEMPLATE:-true}" @@ -104,7 +107,6 @@ for conc in "${CONC_LIST_ARR[@]}"; do --tokenizer /model --port "$PORT" --backend "$BACKEND" - --endpoint "$ENDPOINT" --input-len "$ISL" --output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" @@ -114,6 +116,7 @@ for conc in "${CONC_LIST_ARR[@]}"; do --result-dir "$RESULT_DIR" --bench-serving-dir "$INFMAX_WS" ) + [[ -n "$ENDPOINT" ]] && args+=(--endpoint "$ENDPOINT") [[ "$USE_CHAT_TEMPLATE" == "true" ]] && args+=(--use-chat-template) [[ "$DSV4" == "true" ]] && args+=(--dsv4) [[ "$TRUST_REMOTE_CODE" == "true" ]] && args+=(--trust-remote-code) From 24d118f7adb0b9fc1910831f5ee30241a3914659 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 16:55:13 -0500 Subject: [PATCH 15/16] runners: bump srt-slurm pin to NVIDIA/main@1372a10 Both fixes we wanted are now on origin/main: * #110 nginx-rework-ulimit (Ishan): gates the 1M nofile bump behind opt-in frontend.nginx_raise_ulimit. Default off, fixes clusters whose container RLIMIT_NOFILE hard cap < 1M. * #111 (cam): demotes the per-srun command logger.info to logger.debug so the 5KB fingerprint heredoc stops dominating orchestrator logs. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index d76a7439e..e1d94b1a6 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -917,11 +917,12 @@ sanitize_image_filename() { # UV_VENV_DIR default .venv (inside the cloned repo) clone_and_install_srtctl() { local repo_url="https://github.com/NVIDIA/srt-slurm.git" - # Pinned to ishan-rework-nginx tip — gates the nginx ulimit + worker_rlimit_nofile - # behind an opt-in `frontend.nginx_raise_ulimit` field (default false). #108's - # unconditional `ulimit -n 1048576 && nginx` chain previously crashed clusters - # whose container RLIMIT_NOFILE hard limit was below 1M. - local ref="425b486ce23c6a68ddb57009998a666c0acd0892" + # Pinned to NVIDIA/srt-slurm@main — currently 1372a10. 
Includes: + # * #110 nginx-rework-ulimit: gates `ulimit -n 1048576` + worker_rlimit_nofile + # behind opt-in `frontend.nginx_raise_ulimit` (we don't opt in). + # * #111 srun command line log demoted INFO -> DEBUG (5KB fingerprint + # heredoc no longer dominates orchestrator log). + local ref="1372a10c493e3fd757f342d8516a5a91c30fe6ce" local repo_dir="${SRT_REPO_DIR:-srt-slurm}" local uv_install_dir="${UV_INSTALL_DIR:-${HOME}/.local/bin}" local uv_venv_dir="${UV_VENV_DIR:-.venv}" From 792d8aa4b8586acb09227d4a04776bb8b956bcd3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 28 Apr 2026 17:11:46 -0500 Subject: [PATCH 16/16] =?UTF-8?q?srt-slurm:=20migrate=20remaining=20364=20?= =?UTF-8?q?recipes=20from=20sa-bench=20=E2=86=92=20custom?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the proof-of-life recipe (dsr1-fp4-gb200-dynamo-sglang low-latency, conc 4/8/32) ran clean end-to-end on a real cluster, sweep the rest of the tree onto the new shape so all multi-node throughput sweeps drive utils/bench_serving/benchmark_serving.py via benchmarks/multi_node/srt_bench.sh instead of srt-slurm's bundled sa-bench client. Each migrated recipe replaces: benchmark: type: "sa-bench" isl: … osl: … concurrencies: … req_rate: … [use_chat_template: false] with: container_mounts: "$INFMAX_WORKSPACE": "/infmax-workspace" benchmark: type: "custom" command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" env: [MODEL_NAME: "..."] # only when server's served-model-name diverges # from the master-yaml `model:` value PREFILL_GPUS: "..." # per prefill worker (filename component) DECODE_GPUS: "..." # per decode worker (filename component) TOTAL_GPUS: "..." # sum across all workers (filename component) [USE_CHAT_TEMPLATE: "false"] # only carried over when set in original GPU counts derived from each recipe's `resources:` block — uses gpus_per_prefill / gpus_per_decode when set, else falls back to nodes * gpus_per_node / workers. MODEL_NAME override added on the 59 sglang recipes whose backend.sglang_config.served-model-name is "deepseek-ai/DeepSeek-R1" while master-yaml `model:` is the more specific "deepseek-ai/DeepSeek-R1-0528" / "nvidia/DeepSeek-R1-0528-NVFP4-v2" revision tag. Skipped: - 3 sglang multi-override base files (1k1k.yaml / 8k1k.yaml under dsr1/sglang/b200-fp{4,8}/) — their `benchmark:` lives nested under `base:` and gets sa-bench-style overrides per `:override[N]` reference. Migrating them needs a separate pass that handles the override-merge semantics; their 26 master-yaml refs continue to dispatch via srt-slurm's bundled sa-bench until then. Tracked as follow-up. Validation: schema accepts all 81 master-yaml entries, 149/149 tests pass. 
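The GPU-count rule, sketched in shell (gpus_per_* field names follow the
recipes' `resources:` blocks; `prefill_workers`/`decode_workers` are
illustrative names for the per-role worker counts, not literal recipe keys):

    # per role: explicit gpus_per_* wins, else divide the node pool evenly
    prefill_gpus="${gpus_per_prefill:-$(( nodes * gpus_per_node / workers ))}"
    decode_gpus="${gpus_per_decode:-$(( nodes * gpus_per_node / workers ))}"
    # TOTAL_GPUS is the sum across all workers
    total_gpus=$(( prefill_workers * prefill_gpus + decode_workers * decode_gpus ))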
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml | 17 ++++++++++++----- .../gb200-fp4/1k1k/disagg/stp/max-tpt.yaml | 17 ++++++++++++----- .../gb200-fp4/1k1k/disagg/stp/mid-curve.yaml | 17 ++++++++++++----- .../gb200-fp4/8k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../gb200-fp4/8k1k/disagg/stp/max-tpt.yaml | 17 ++++++++++++----- .../gb200-fp4/8k1k/disagg/stp/mid-curve.yaml | 17 ++++++++++++----- .../gb200-fp8/1k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../gb200-fp8/1k1k/disagg/stp/max-tpt.yaml | 17 ++++++++++++----- .../gb200-fp8/1k1k/disagg/stp/mid-curve.yaml | 17 ++++++++++++----- .../gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml | 17 ++++++++++++----- .../gb200-fp8/8k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../gb200-fp8/8k1k/disagg/stp/max_tpt.yaml | 17 ++++++++++++----- .../gb200-fp8/8k1k/disagg/stp/mid-curve.yaml | 17 ++++++++++++----- .../gb300-fp4/1k1k/disagg/stp/low_latency.yaml | 17 ++++++++++++----- .../gb300-fp4/1k1k/disagg/stp/max_tpt.yaml | 17 ++++++++++++----- .../gb300-fp4/1k1k/disagg/stp/mid_curve.yaml | 17 ++++++++++++----- .../gb300-fp4/8k1k/disagg/stp/low_latency.yaml | 17 ++++++++++++----- .../gb300-fp4/8k1k/disagg/stp/max_tpt.yaml | 17 ++++++++++++----- .../gb300-fp4/8k1k/disagg/stp/mid_curve.yaml | 17 ++++++++++++----- .../gb300-fp8/1k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../sglang/gb300-fp8/1k1k/disagg/stp/max.yaml | 17 ++++++++++++----- .../sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml | 17 ++++++++++++----- .../gb300-fp8/8k1k/disagg/stp/low-latency.yaml | 17 ++++++++++++----- .../sglang/gb300-fp8/8k1k/disagg/stp/max.yaml | 17 ++++++++++++----- .../sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml | 17 ++++++++++++----- .../disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 17 ++++++++++++----- .../disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml | 17 ++++++++++++----- .../1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml | 17 ++++++++++++----- .../1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml | 17 ++++++++++++----- .../disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml | 17 ++++++++++++----- .../disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml | 17 ++++++++++++----- .../1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml | 17 ++++++++++++----- .../1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml | 18 ++++++++++++------ .../1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml | 17 ++++++++++++----- .../1k1k/disagg/stp/bs256-1p6d-dep.yaml | 17 ++++++++++++----- .../1k1k/disagg/stp/bs256-1p6d-tp.yaml | 18 ++++++++++++------ .../1k1k/disagg/stp/low-latency-1p9d.yaml | 17 
++++++++++++----- .../8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/bs16-1p3d-mtp.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml | 17 ++++++++++++----- .../8k1k/disagg/mtp/bs64-2p3d-mtp.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml | 17 ++++++++++++----- .../8k1k/disagg/stp/bs128-1p1d-dep.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml | 17 ++++++++++++----- .../h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml | 17 ++++++++++++----- .../mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- ...tx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml | 16 +++++++++++----- ...tx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml | 16 +++++++++++----- ...tx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml | 16 +++++++++++----- ...ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml | 16 +++++++++++----- .../mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml | 16 +++++++++++----- ...ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml | 16 +++++++++++----- .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml | 16 +++++++++++----- .../ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml | 16 +++++++++++----- ...ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml | 16 +++++++++++----- ...tx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml | 16 +++++++++++----- .../ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml | 16 +++++++++++----- .../ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml | 16 +++++++++++----- .../mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml | 16 +++++++++++----- 
.../ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml | 16 +++++++++++----- .../ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml | 16 +++++++++++----- ...tx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml | 16 +++++++++++----- ...ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml | 16 +++++++++++----- ...ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml | 16 +++++++++++----- .../stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml | 16 +++++++++++----- .../ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml | 16 +++++++++++----- .../ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml | 16 +++++++++++----- ...ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml | 16 +++++++++++----- .../stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml | 16 +++++++++++----- ...tx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml | 16 +++++++++++----- .../ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml | 16 +++++++++++----- ...tx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml | 16 +++++++++++----- ...ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml | 16 +++++++++++----- .../ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml | 16 +++++++++++----- ...x2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml | 16 +++++++++++----- .../ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml | 16 +++++++++++----- 
.../ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml | 16 +++++++++++----- .../ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml | 16 +++++++++++----- .../ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml | 16 +++++++++++----- .../ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml | 16 +++++++++++----- .../ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml | 16 +++++++++++----- .../ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml | 16 +++++++++++----- .../ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml | 16 +++++++++++----- .../ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml | 16 +++++++++++----- ...ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml | 16 +++++++++++----- ...tx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml | 16 +++++++++++----- .../ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml | 16 +++++++++++----- .../ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml | 16 +++++++++++----- .../ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- ...ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- ...tx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 16 +++++++++++----- ...ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml | 16 +++++++++++----- ...x1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml | 16 +++++++++++----- ...tx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml | 16 +++++++++++----- .../ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml | 16 +++++++++++----- ...tx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml | 16 +++++++++++----- .../ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml | 16 +++++++++++----- 
...ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml | 16 +++++++++++----- .../ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml | 16 +++++++++++----- .../ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 16 +++++++++++----- ...ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml | 16 +++++++++++----- .../ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml | 16 +++++++++++----- .../ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml | 16 +++++++++++----- ...ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml | 16 +++++++++++----- ...ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml | 16 +++++++++++----- ...tx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml | 16 +++++++++++----- .../ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml | 16 +++++++++++----- .../ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml | 16 +++++++++++----- .../ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml | 16 +++++++++++----- .../ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 16 +++++++++++----- .../ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 16 +++++++++++----- ...x2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml | 16 +++++++++++----- ...ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml | 16 +++++++++++----- ...tx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 16 +++++++++++----- ...x2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml | 16 +++++++++++----- ...tx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml | 16 +++++++++++----- ...tx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml | 16 +++++++++++----- 
...x3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml | 16 +++++++++++----- ...x10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml | 16 +++++++++++----- .../ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml | 16 +++++++++++----- ...tx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml | 16 +++++++++++----- ...ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml | 16 +++++++++++----- .../ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml | 16 +++++++++++----- ...ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml | 16 +++++++++++----- ...ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml | 16 +++++++++++----- ...tx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml | 16 +++++++++++----- ...tx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml | 16 +++++++++++----- .../ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml | 16 +++++++++++----- .../ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- ...128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- .../c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- ...256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml | 16 +++++++++++----- ...c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- ...c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml | 16 +++++++++++----- 
...128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- ...c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- ...256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- ...c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- ...c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml | 16 +++++++++++----- ...c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- ...c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml | 16 +++++++++++----- .../c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml | 16 +++++++++++----- .../c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml | 16 +++++++++++----- ...128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- .../c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml | 16 +++++++++++----- ...256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- ...512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml | 16 +++++++++++----- ...c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/disagg-gb200-1p1d-dep8-dep16.yaml | 18 ++++++++++++------ .../stp/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++++++++++++------ .../stp/disagg-gb200-1p1d-dep8-tep8.yaml | 18 ++++++++++++------ .../stp/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++++++++++++------ .../stp/disagg-gb200-7p1d-dep8-dep16.yaml | 18 ++++++++++++------ .../ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml | 16 +++++++++++----- ...4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...ep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- ...p4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...ep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...p4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml | 16 +++++++++++----- .../ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml | 16 +++++++++++----- ...4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml | 16 +++++++++++----- ...ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml | 16 +++++++++++----- .../stp/disagg-gb200-1p1d-dep4-dep16.yaml | 16 +++++++++++----- .../stp/disagg-gb200-1p4d-dep4-tep4.yaml | 16 +++++++++++----- .../stp/disagg-gb200-1p4d-dep4-tep4.yaml | 16 +++++++++++----- .../stp/disagg-gb200-3p1d-dep4-dep16.yaml | 16 +++++++++++----- .../stp/disagg-gb200-5p1d-dep4-dep8.yaml | 16 +++++++++++----- .../stp/disagg-gb200-6p1d-dep4-dep16.yaml | 16 +++++++++++----- 364 files changed, 4071 
insertions(+), 1827 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml index 3c1f465fa..36b78e975 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml @@ -133,9 +133,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '128' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml index 51671712c..0fed3f9a6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml @@ -133,9 +133,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '128' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "40" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml index 27dbbe30d..e39611a4b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml @@ -133,9 +133,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: 8x16x32x64x128 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml index e5eefa2d2..78dc57d5a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_0.yaml @@ -136,9 +136,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '288' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml index fe0cd9a9f..202a10631 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_1.yaml @@ -136,9 +136,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: 160x288 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml index 7d050ff12..e2a619e29 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_2.yaml @@ -136,9 +136,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '512' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml index e687ccf84..5e959ca38 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/mtp/8k1k_mtp_maxtpt_3.yaml @@ -136,9 +136,16 @@ health_check: max_attempts: 720 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '1024' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml index 894cef0c7..24d37e3ee 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml @@ -131,9 +131,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '128' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml index c05382ef8..c97d109d9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml @@ -131,9 +131,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '128' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "40" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml index 69e36a289..503f1363b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml @@ -131,9 +131,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: 8x16x32x64x128 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml index 9846a1f05..cb8d13717 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml @@ -132,9 +132,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '288' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml index e4eccdeab..875893e72 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml @@ -132,9 +132,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: 160x288 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml index c4cc2dd33..1402c1202 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml @@ -132,9 +132,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '512' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml index 59cbb8197..a689bf0ac 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/b200-fp8/8k1k/disagg/stp/8k1k_stp_maxtpt_3.yaml @@ -132,9 +132,16 @@ health_check: max_attempts: 360 interval_seconds: 10 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: sa-bench - isl: 8192 - osl: 1024 - req_rate: inf - concurrencies: '1024' + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml index 1075c93eb..eb499618e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/max-tpt.yaml @@ -175,9 +175,16 @@ backend: dp-size: 48 ep-size: 48 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2048x4096" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "48" + TOTAL_GPUS: "64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml index d8c80dea7..fdfce3821 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/1k1k/disagg/stp/mid-curve.yaml @@ -174,9 +174,16 @@ backend: dp-size: 32 ep-size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x2048x4096x8192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml index 14ebda144..48b044bd3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/low-latency.yaml @@ -110,9 +110,16 @@ backend: expert-parallel-size: 1 enable-dp-attention: false +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8" - req_rate: 300 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "20" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml index cf2759871..cbf43343b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/max-tpt.yaml @@ -171,9 +171,16 @@ backend: dp-size: 32 ep-size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2048" - req_rate: 700 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml index 8380eb5bf..39f9ab7c8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp4/8k1k/disagg/stp/mid-curve.yaml @@ -171,9 +171,16 @@ backend: dp-size: 48 ep-size: 48 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x2048x4096" - req_rate: 700 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "48" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml index 155d1664c..5dc0c0c73 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/low-latency.yaml @@ -113,9 +113,16 @@ backend: disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "8" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml index 5d3c91794..c7a9e0923 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/max-tpt.yaml @@ -166,10 +166,17 @@ backend: disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048x4096x6144" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml index 1f83ed1bd..0de49d6d7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/mid-curve.yaml @@ -165,10 +165,17 @@ backend: disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1024x2048x4096" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "48" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml index 08fe2fa90..f335aa042 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/1k1k/disagg/stp/ultra-tpt.yaml @@ -167,10 +167,17 @@ backend: disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml index 368b03409..94ee5ed1f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/low-latency.yaml @@ -109,9 +109,16 @@ backend: disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml index f03e34b8d..2865f2e52 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/max_tpt.yaml @@ -163,9 +163,16 @@ backend: disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2048x4096x6144" - req_rate: "300" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "24" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml index c822d67f3..a1559e71d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb200-fp8/8k1k/disagg/stp/mid-curve.yaml @@ -162,9 +162,16 @@ backend: disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x1024x2048x6144" - req_rate: "300" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml index 252eafa2b..c531f8446 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/low_latency.yaml @@ -108,9 +108,16 @@ backend: fp4-gemm-backend: "flashinfer_trtllm" disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x32" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "12" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml index c941651aa..c4a3d6524 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/max_tpt.yaml @@ -176,9 +176,16 @@ backend: dp-size: 48 ep-size: 48 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x2048x4096x8192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "48" + TOTAL_GPUS: "64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml index 15d3b3930..e6d388906 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/1k1k/disagg/stp/mid_curve.yaml @@ -174,9 +174,16 @@ backend: dp-size: 32 ep-size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "512x2048x4096x8192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml index d3c61231b..5c95e1ffa 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/low_latency.yaml @@ -111,9 +111,16 @@ backend: fp4-gemm-backend: "flashinfer_trtllm" disaggregation-transfer-backend: nixl +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x32x64" - req_rate: 300 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "20" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml index 001311ed7..29a619a6f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/max_tpt.yaml @@ -171,9 +171,16 @@ backend: dp-size: 32 ep-size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2048" - req_rate: 700 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml index 41043ed0d..b4de76bb9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp4/8k1k/disagg/stp/mid_curve.yaml @@ -171,9 +171,16 @@ backend: dp-size: 48 ep-size: 48 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x2048x4096" - req_rate: 700 + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "48" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml index 51628e081..57ea3ff5e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/low-latency.yaml @@ -113,10 +113,17 @@ backend: data-parallel-size: 1 expert-parallel-size: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: [4,8,16,32] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "20" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml index c88a487b8..d27830a5f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/max.yaml @@ -162,10 +162,17 @@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024] cuda-graph-max-bs: 1024 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: [4096,7168,7680] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml index ee6690285..507f5607a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/1k1k/disagg/stp/mid.yaml @@ -161,10 +161,17 @@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: [1024,2048,4096,6144] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml index 71fd0f889..766ecc632 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/low-latency.yaml @@ -113,9 +113,16 @@ backend: data-parallel-size: 1 expert-parallel-size: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [4,8] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "8" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml index 6d219cc1e..a7da42825 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/max.yaml @@ -162,10 +162,17 @@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [2048,4096] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "24" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml index b085f50f8..6c367ebf3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/gb300-fp8/8k1k/disagg/stp/mid.yaml @@ -162,10 +162,17 @@ backend: cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768] cuda-graph-max-bs: 768 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. 
See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [128,256,512,1024] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "72" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml index 989fc47d1..76f03d343 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -106,9 +106,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml index 0ce17e8a4..3c6647c24 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/mtp/h100-fp8-1p2d-max-tp-mtp.yaml @@ -108,9 +108,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64x128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml index c47b6c867..dc186726c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml @@ -94,9 +94,16 @@ backend: max-running-requests: 64 cuda-graph-max-bs: 64 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml index 1f7cf9985..1e4b20c13 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/1k1k/disagg/stp/h100-fp8-1p2d-max-tp.yaml @@ -94,9 +94,16 @@ backend: max-running-requests: 128 cuda-graph-max-bs: 128 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64x128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "48" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml index 4a0448658..17b87aba7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-dep-mtp.yaml @@ -108,9 +108,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml index 591556df7..4dbe673c6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/mtp/h100-fp8-1p1d-max-tp-mtp.yaml @@ -108,9 +108,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64x128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml index 6c8a1c956..dc186726c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-dep.yaml @@ -94,9 +94,16 @@ backend: max-running-requests: 64 cuda-graph-max-bs: 64 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml index 196e781df..120b9270c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h100-fp8/8k1k/disagg/stp/h100-fp8-1p1d-max-tp.yaml @@ -94,9 +94,16 @@ backend: max-running-requests: 128 cuda-graph-max-bs: 128 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x2x4x8x16x32x64x128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "16" + DECODE_GPUS: "16" + TOTAL_GPUS: "32" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml index 2c6539c93..d9177b2e1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-dep-mtp.yaml @@ -113,9 +113,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x512x1024x2048" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml index f2fc08020..bbdea98a4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/bs256-1p6d-tp-mtp.yaml @@ -109,10 +109,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - # concurrencies: "128x256x512" - concurrencies: "512x1024x2048" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml index 5d6e66ebb..2569666c2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/mtp/low-latency-1p9d-mtp.yaml @@ -108,9 +108,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64x128x256" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "80" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml index 1932dc222..0d098c736 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-dep.yaml @@ -100,10 +100,17 @@ backend: max-running-requests: 512 cuda-graph-max-bs: 512 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x512x1024x2048" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml index 05afea199..af5aded2c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/bs256-1p6d-tp.yaml @@ -99,11 +99,17 @@ backend: max-running-requests: 512 cuda-graph-max-bs: 512 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - # concurrencies: "128x256x512" - concurrencies: "512x1024x2048" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml index e60102aae..9cfc153f2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/1k1k/disagg/stp/low-latency-1p9d.yaml @@ -98,9 +98,16 @@ backend: max-running-requests: 256 cuda-graph-max-bs: 256 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64x128x256" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + MODEL_NAME: "deepseek-ai/DeepSeek-R1" + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "80" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml index 4d62e5a04..292289a7e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs128-1p1d-dep-mtp.yaml @@ -110,9 +110,16 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "32x64x128x256x512"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml
index 97ea49b9a..76d9f6b1f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs16-1p3d-mtp.yaml
@@ -108,9 +108,16 @@ backend:
   speculative-eagle-topk: 1
   speculative-num-draft-tokens: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml
index d58d55b1b..01a278260 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs4-1p7d-mtp.yaml
@@ -108,9 +108,16 @@ backend:
   speculative-eagle-topk: 1
   speculative-num-draft-tokens: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x4x8"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml
index ed1232d16..e426c78ba 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs64-2p3d-mtp.yaml
@@ -110,12 +110,19 @@ backend:
   speculative-eagle-topk: 1
   speculative-num-draft-tokens: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "32x64x128"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 # benchmark:
 #   type: "gpqa"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml
index 5bd83fa5c..2922ba1df 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/mtp/bs8-1p6d-mtp.yaml
@@ -109,9 +109,16 @@ backend:
   speculative-eagle-topk: 1
   speculative-num-draft-tokens: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2x4x8x16x32"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml
index d131f6b02..e86438436 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs128-1p1d-dep.yaml
@@ -100,10 +100,17 @@ backend:
   max-running-requests: 256
   cuda-graph-max-bs: 256
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "64x128x256"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml
index 576ff2a03..75e36493b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs16-1p3d.yaml
@@ -98,10 +98,17 @@ backend:
   max-running-requests: 32
   cuda-graph-max-bs: 32
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "8x16x32"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml
index 78ce3d5a1..56aa58d11 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs4-1p7d.yaml
@@ -98,10 +98,17 @@ backend:
   max-running-requests: 8
   cuda-graph-max-bs: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x4x8"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "64"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml
index 73aaacc30..7c876e3cf 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs64-2p3d.yaml
@@ -100,12 +100,19 @@ backend:
   max-running-requests: 128
   cuda-graph-max-bs: 128
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "32x64x128"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 # benchmark:
 #   type: "gpqa"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml
index c37c50eea..5eeba8f61 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/sglang/h200-fp8/8k1k/disagg/stp/bs8-1p6d.yaml
@@ -99,10 +99,17 @@ backend:
   max-running-requests: 16
   cuda-graph-max-bs: 16
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4x8x16"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    MODEL_NAME: "deepseek-ai/DeepSeek-R1"
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
index 9532b9cc5..6b34b2fb7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml
@@ -102,12 +102,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "875"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
index 31bf5bf20..4445c953b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
@@ -97,12 +97,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
index 3a3309f56..b7d1c9260 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
@@ -111,12 +111,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "10x15x25x45x90x180"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
index 90ad2c657..d5def7a35 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml
@@ -105,12 +105,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4968"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
index 31adc6239..dde552b51 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml
@@ -111,12 +111,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "10860"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "32"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
index 6c3e4bf80..275c140a5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml
@@ -102,12 +102,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4096"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
index 56746330e..ae7ba8483 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml
@@ -99,12 +99,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "2192"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
index 0fde29f21..16961a5e0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml
@@ -97,12 +97,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1365"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
index 4612b7c2c..ac84ded85 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
@@ -91,12 +91,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
index 53e833b75..930f2520f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml
@@ -112,12 +112,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "10x15x25x45x90x180"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
index 47c2c6e22..d90c6f3b0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/1k1k/disagg/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml
@@ -101,12 +101,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "450"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "52"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
index a1ec4f38d..1017f8feb 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml
@@ -101,12 +101,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "90"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
index 48aad03b6..4c919e2e1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml
@@ -108,12 +108,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "66"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
index 559841f73..dec75f377 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
@@ -97,12 +97,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
index f9d9843f6..1c8582c31 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml
@@ -104,12 +104,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "10x15x30x60"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
index 7e06d12b5..37ab36d1f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml
@@ -105,12 +105,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "548"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
index 96b4d97c5..693c2221c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml
@@ -109,12 +109,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1096x1691"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
index 98229c7bf..ffbc9ae61 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml
@@ -104,12 +104,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "658"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
index 762987f6e..b2c967541 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml
@@ -92,12 +92,18 @@ backend:
   allreduce_strategy: MNNVL
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
index a03114f95..0f88bb006 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml
@@ -105,12 +105,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "10x15x25x50x100"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
index 4dfe07604..738dd82ea 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml
@@ -100,12 +100,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "370"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "48"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
index 23c2db5d8..22681d23a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml
@@ -103,12 +103,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1606"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
index e94326803..6e233467a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml
@@ -96,12 +96,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "837"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
index b3c9e1300..99f0ea58f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp4/8k1k/disagg/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml
@@ -99,12 +99,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2222"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   nginx_container: "nginx-sqsh"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
index 8c7cf706d..0fbd25b82 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1600]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
index dd06e8462..fe3ab4c6c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1184]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
index d41d81458..ab8b4d1c6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1024]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
index 3b4193e44..a2665a5a4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [896]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "64"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
index de08fe729..057fcbd77 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [8]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
index 0b67948c3..e42404618 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [256]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
index a79351e20..042c00923 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [32]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
index 1814ff355..9ad27278a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml
@@ -100,12 +100,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [64]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
index 2e0ac949f..65aeecbfa 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [4096]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
index 47008c9f0..6159a29ad 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
index aa2d8c6f2..58d800b6a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [32]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
index b9829e22f..0ed6396a0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [4]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
index 56df5bad2..875279c47 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1920]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "48"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
index a412a6419..c277966c4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/1k1k/disagg/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml
@@ -94,12 +94,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [5152]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
index 2ccfffba7..7f03ae1e3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml
@@ -102,12 +102,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [8]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
index a9ad0a7d9..712a67416 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml
@@ -102,12 +102,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [64]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
index 38b12e6c0..4212abd06 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml
@@ -102,12 +102,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [48]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
index 3b38311b7..f3e356085 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml
@@ -102,12 +102,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [8]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
index 378123831..cda4cecfd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml
@@ -104,12 +104,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [288]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
index a26eaf4f1..1cdb3af76 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml
@@ -104,12 +104,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [224]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
index 3c659d4dc..359073927 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml
@@ -104,12 +104,18 @@ backend:
   tensor_parallel_size: 8
 
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [1088]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
index 6c383e60e..7a9a20391 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml
@@ -94,12 +94,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
index 7821ab79e..3f93f9140 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml
@@ -94,12 +94,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [256]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
index 0f2fdd949..ca1c1d60f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml
@@ -96,12 +96,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [1]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
index 305c15124..6b03210e3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml
@@ -94,12 +94,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
index 3c64aacf5..38ed548da 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
index 751bdd585..f086c23c0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [32]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
index cb4c4d8a3..39f1bffd8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [96]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
index db804a6b6..2b787d7f4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b200-fp8/8k1k/disagg/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml
@@ -94,12 +94,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [640]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
index 36b365a7d..554db4ec4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml
@@ -112,12 +112,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "654"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "10"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
index f2cd900c9..497739ac7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml
@@ -110,12 +110,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "271"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "18"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
index 31bae1596..0fbaeb745 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml
@@ -108,12 +108,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "11"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "42"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
index eeb43290a..2d9df253b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml
@@ -124,12 +124,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "10x20x25x60x120x200"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "42"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
index 7f8b9ae4a..c356b1b19 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml
@@ -114,12 +114,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "2342"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
index 98d8ab04d..5735ea337 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml
@@ -115,12 +115,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "8609"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
index a81e980ec..1eed2b318 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml
@@ -116,12 +116,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "12926"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
index 13978a422..7d11fb152 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml
@@ -106,12 +106,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1176"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "18"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
index 5885277d0..458ce824d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
@@ -102,12 +102,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "6"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "34"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
index 9d73c7308..3e493c98e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml
@@ -106,12 +106,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "5x10x15x25"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "22"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
index 92b99de35..adb4a8b79 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml
@@ -121,12 +121,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "60x110x195x395"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "42"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
index 3113744c9..8bd76075a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml
@@ -109,12 +109,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4405"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
index d74782639..76d4cd780 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml
@@ -114,12 +114,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "8192"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "14"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
index 5088b566c..3c0692530 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/1k1k/disagg/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml
@@ -108,12 +108,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4611"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "22"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
index c24f57918..5f522818a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml
@@ -114,12 +114,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2198"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
index 7e2ab395a..41f443c22 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml
@@ -112,12 +112,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "52"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "18"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
index 83c7af6ad..ff3bca726 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml
@@ -108,12 +108,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "8"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "34"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
index 723029b8d..87c3c57b6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml
@@ -111,12 +111,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "32"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "34"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
index 67e9fc568..3f40345ca 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml
@@ -110,12 +110,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "181"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "14"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
index b0494f78f..a52be413d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml
@@ -113,12 +113,18 @@ backend:
   decoding_type: MTP
   num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1197"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
index 5bc38c22a..f515e9aba 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml
@@ -108,12 +108,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "105"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "14"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
index 002aa9e27..7a167eb80 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml
@@ -106,12 +106,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "63"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
index 5e8d96a80..36a6268eb 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml
@@ -115,12 +115,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
index df7612f99..d184a95d5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml
@@ -103,12 +103,18 @@ backend:
   num_postprocess_workers: 4
   allreduce_strategy: MNNVL
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "12"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "18"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
index b791d44b8..bacd57645 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml
@@ -105,12 +105,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "589"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "26"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
index 09b89137c..923b32c05 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml
@@ -113,12 +113,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1093"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
index 0ca0d7692..1173417cc 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp4/8k1k/disagg/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml
@@ -113,12 +113,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2048"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "2"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
index cfa58f2a3..9e1da3cf3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [3072]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
index 866ccbb8e..d1ccc8b44 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [2560]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
index 4e7600a2c..74802bbc7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [720]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "44"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
index a00639e26..4a09efd68 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [160]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
index 62ae3984f..a6cbb9b66 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [10]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
index 957676992..7ccdfa4af 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [11264]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
index f41079a54..fa0675ade 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml
@@ -106,12 +106,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [2112]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
index 7746b638c..121844730 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml
@@ -106,12 +106,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [3072]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
index bdaef8f3e..7a7b2e1fe 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml
@@ -106,12 +106,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [1280]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
index f469bf3bc..0e75f3747 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml
@@ -107,12 +107,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [10]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
index b3b2d8740..384ef6e0c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml
@@ -107,12 +107,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [128]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
index 36476736b..5fb7781d4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml
@@ -107,12 +107,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [384]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "68"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
index c9d131239..364b538d6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/1k1k/disagg/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml
@@ -106,12 +106,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: [16384]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
index 7e806469c..1039c9e2c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [72]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
index c203b724a..89a1abdd3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [40]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
index 48773bf14..87ad50002 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [5]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
index bba0d5a65..4edbcf88d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml
@@ -113,12 +113,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [20]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
index 9511ede04..7eba0cdd6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: [144]
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
index 7513770d8..555ec7688 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml
@@ -112,12 +112,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [512] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml index 2852df6c3..8c9160c66 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [64] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml index 68ae8f4dc..54de6c71f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [10] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "68" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml index 1c2977396..4e7808183 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml @@ -106,12 +106,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [256] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml index 343b25905..6d6573b24 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml @@ -106,12 +106,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [512] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "20" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml index 5aa5546ab..dd915b01d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [256] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "52" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml index df8c2831c..1e0375787 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml @@ -106,12 +106,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [1075] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml index 9b0df56e9..eb6170f6a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/b300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml @@ -106,12 +106,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: [3072] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml index a8f90e9bd..f6cb09bbc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -99,12 +99,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "180" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml index be4f29045..aa711f76c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -103,12 +103,18 @@ backend: speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x8x12x24x48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml index 5dd8a302b..50a8aa6c4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml @@ -134,12 +134,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml index 08fc612ec..53fae254f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml @@ -110,12 +110,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "44" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml index 44a05c484..507a15f85 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml @@ -195,12 +195,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "16130" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml index c353c3df0..24294befe 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml @@ -95,12 +95,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "666" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml index a62b540d9..67fd9d9a4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -157,12 +157,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "12" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml index d56eba13c..57be7c35e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml @@ -189,12 +189,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "6144" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "4" + TOTAL_GPUS: "12" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml index 94a45661b..e8794eae8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -92,12 +92,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml index a93c86f82..e9d59aaab 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml @@ -107,12 +107,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "12x24x48x96x192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml index 9aa57eb46..c752a5600 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -128,12 +128,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml index 3501708c2..118580aa9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml @@ -101,12 +101,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml index 0a88341a1..0ccf95443 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml @@ -134,12 +134,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "60" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml index b4dd6005d..2854854f2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -105,12 +105,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x12x24x48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml index 9374538f8..bddcf060e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -99,12 +99,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "180" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "44" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml index a62e4f24f..eb101a191 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml @@ -110,12 +110,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "44" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml index ee3082fe5..3bf47d0a8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml @@ -101,12 +101,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "666" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml index 4df408491..7cfee6b2e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -128,12 +128,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml index 4b603ad67..a7e491533 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml @@ -103,12 +103,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "12x44x76" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml index 1ee953844..fa6483998 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -94,12 +94,18 @@ backend: +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml index b08791f00..c0d6dc3f3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml @@ -94,12 +94,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "333" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml index 7f4e9594e..b78f93a10 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml @@ -97,12 +97,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "60" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml index 059688716..080186d0f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp4/8k1k/disagg/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml @@ -109,12 +109,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml index ba7f2ff21..6ea81b176 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml @@ -105,12 +105,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml index 218b85744..8e5f86356 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['615'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml index fe49d8959..a96a862ef 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml @@ -129,12 +129,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
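+#
+# Note (an observed convention in this patch, not a stated launcher
+# requirement): the gb200-fp8 recipes pass PREFILL_GPUS=8, i.e. one ctx worker
+# spans 8 GPUs here, versus 4 in the b300-fp8 and gb200-fp4 recipes above and
+# 2 in the gb300-fp4 recipes below. The same TOTAL_GPUS arithmetic holds,
+# e.g. for this ctx1_gen1_dep8 recipe: 1 * 8 + 1 * 8 = 16.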
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['2151'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml index 25847ed23..449ca1d85 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml @@ -161,12 +161,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['4301'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml index 62d4be838..e6f72bd07 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['9'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml index 47f21d46b..519f5da0c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['18'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml index ecb7c92cd..23c1180d5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['36'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml index 47b869af5..868c65032 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['2151'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml index d1e3cae50..64f1004f5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml @@ -95,12 +95,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['1127'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml index c48edbd5f..05f3d0763 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['256'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml index 08139cf82..5fcaf989c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml @@ -155,12 +155,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['4301'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml index 14b33599c..5f54ed0f7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml @@ -187,12 +187,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['6144'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml index 2b9250430..801c5214a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['3'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml index 160f4c6ca..9c57a2897 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/1k1k/disagg/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml @@ -93,12 +93,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['27'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml index 8f305ced0..12632ffd1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['6'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml index bea950ac7..a80c790f9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['15'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml index fbf861990..1f108d424 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml @@ -97,12 +97,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['90'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml index ea8a7d013..08f63213f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['333'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml index 2ad2e727d..982765ae5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml @@ -105,12 +105,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml index 95bf6192f..6b286ce2e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml @@ -98,12 +98,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['333'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml index 35da2b70f..9bc424961 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -101,12 +101,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml index 178a3b7df..0430ce4b1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['63'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml index f33813fd9..d1b526a07 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['6'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml index 98aee313b..fdf1e856c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['18'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml index 816065639..2dffe83f1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml @@ -92,12 +92,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['333'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml index f7d87c1b3..ba7c6142f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml @@ -95,12 +95,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['615'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml index 27a19e5b8..8675bf58d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml @@ -93,12 +93,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "32" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml index 634f07cdb..ca9b432d0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb200-fp8/8k1k/disagg/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "16" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml index b4434cdda..b3d1dd62a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml @@ -103,12 +103,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "333" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml index e264a1796..2b9d42408 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml @@ -198,12 +198,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "3226" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "4" + TOTAL_GPUS: "6" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml index 67c672ffb..c2c4c537a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -101,12 +101,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml index aab184727..da70d4074 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml @@ -106,12 +106,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "8x12x24x48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml index 58cbacdf4..12174174c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml @@ -121,12 +121,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "22" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml index 698989630..502ae7cf2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml @@ -109,12 +109,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "38" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml index 642aa6c43..cba8a4f64 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -95,12 +95,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml index 44774b6bc..794556055 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml @@ -109,12 +109,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "12x48x96x192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml index ffc2850fb..8249a5369 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml @@ -99,12 +99,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml index 28e148d02..5f96315ff 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml @@ -223,12 +223,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "8192" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "12" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml index 4d4ffe594..50f4f8f0f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml @@ -131,12 +131,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
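+#
+# $INFMAX_WORKSPACE is left unexpanded in the mount key on purpose: the
+# source path is presumably resolved from the launch environment when the
+# bench container starts, so the recipe stays portable across checkouts.
+# If the variable is unset, expect the mount (and the custom command with
+# it) to fail rather than fall back to a default.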
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4301" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "22" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml index de841c92c..9acddc31e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/1k1k/disagg/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml @@ -104,12 +104,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "38" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml index 7bf2a9332..4d258c289 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml @@ -105,12 +105,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "666" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml index 09710a97d..c10a8598b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml @@ -133,12 +133,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml index 61988358c..df0375f0e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml @@ -113,12 +113,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1127" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "42" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml index f07f607ea..6ce834ce3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml @@ -104,12 +104,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "33" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "26" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml index be9842323..53771a342 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml @@ -101,12 +101,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
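+#
+# The command line is identical in every converted recipe; all per-recipe
+# variation flows through env. That presumably keeps srt_bench.sh as the
+# single entrypoint for the whole sweep, with the GPU-topology variables
+# above as its only per-recipe inputs.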
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml index 5d45c06d3..b2349f421 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml @@ -104,12 +104,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "12x24" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml index c0c4f66e7..ddd5641a9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml @@ -101,12 +101,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "180" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml index e719310a4..aaca79561 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "308" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml index 6b6f4a36e..f141a5005 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml @@ -128,12 +128,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "3228" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "4" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml index 42523722e..882083834 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml @@ -111,12 +111,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2253" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "44" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml index 34678b650..e4568f7e1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml @@ -99,12 +99,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "72" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "26" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml index 158dd4ed9..5a6e21737 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml @@ -95,12 +95,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "5" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml index f2f18332c..4b8ad5a43 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: num_postprocess_workers: 4 allreduce_strategy: MNNVL +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "12" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "8" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml index f380710f8..6f6194a84 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "5x15x30" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "4" + TOTAL_GPUS: "22" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml index 8dbb94ea5..f68b83534 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml @@ -98,12 +98,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "666" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "32" + TOTAL_GPUS: "46" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml index eba48a69c..db6ae1b3f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp4/8k1k/disagg/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml @@ -104,12 +104,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1229" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "2" + DECODE_GPUS: "16" + TOTAL_GPUS: "34" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml index fd4c842d5..f03320ce7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -103,12 +103,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
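+#
+# In the hunks touched here, PREFILL_GPUS tracks the platform: 8 on the
+# gb200-fp8 recipes, 2 on gb300-fp4, and 4 on gb300-fp8. That reads as the
+# per-worker prefill sizing each config was tuned for, rather than anything
+# srt_bench.sh itself requires.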
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "20" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml index 24cc7fcb2..3783dd563 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml @@ -99,12 +99,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['180'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml index dd886c1c6..d4cf77025 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['8'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml index 6625fde5d..e6d895550 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['24'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml index 14b8c83ec..f178dc30a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml @@ -115,12 +115,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['2253'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml index 30335f8e4..562ada512 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml @@ -101,12 +101,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['564'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml index 5985d197c..87ba559b2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml @@ -163,12 +163,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['8192'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml index 5d74bf4f0..57803a156 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml @@ -96,12 +96,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['84'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml index 9b51b74ce..3f3905468 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['4'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml index bc0a9ad4a..6e2ba5e8e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
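+#
+# Note that the old sa-bench fields (isl/osl/concurrencies/req_rate) do not
+# survive the switch to the custom client: the benchmark block now carries
+# only GPU topology, so the workload shape is presumably fixed inside
+# srt_bench.sh, or derived there from the recipe path.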
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['24'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml index 126e651e1..2580bab99 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml @@ -109,12 +109,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['2253'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml index f66062760..c7dc2dcdd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml @@ -97,12 +97,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "40" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml index 68a326b76..c4613dbb2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml @@ -157,12 +157,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['8602'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml index 8cd72351d..bdc07bf9d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/1k1k/disagg/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml @@ -189,12 +189,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: ['12288'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "28" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml index 6123b194f..95a1bd02e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml @@ -107,12 +107,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml index 3c61eca96..644b5a20b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['8'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml index 539a3f780..5c7a8ed5c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['24'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml index 49e94caa5..c78705873 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml @@ -100,12 +100,18 @@ backend: tensor_parallel_size: 32 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['333'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "32" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml index e531467ca..e00287de7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml @@ -115,12 +115,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['1229'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml index fadb3c8c1..162f003e4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml @@ -103,12 +103,18 @@ backend: tensor_parallel_size: 16 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['666'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "16" + TOTAL_GPUS: "48" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml index 30ba58dcd..3a470113e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: ['4'] - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "4" + DECODE_GPUS: "8" + TOTAL_GPUS: "36" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml index 091164082..8b14ffd93 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml @@ -94,12 +94,18 @@ backend: tensor_parallel_size: 8 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['24']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
index de8d408d1..f5994c054 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['36']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
index 70aade3de..fcf7292da 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml
@@ -97,12 +97,18 @@ backend:
 
   tensor_parallel_size: 16
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['666']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "32"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
index cfe8dead6..ac8d6faa6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml
@@ -95,12 +95,18 @@ backend:
 
   tensor_parallel_size: 32
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['512']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "32"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
index 97745e8c8..e585cc065 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml
@@ -101,12 +101,18 @@ backend:
 
   tensor_parallel_size: 16
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['1229']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "44"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
index 09e23abed..87272ba14 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/gb300-fp8/8k1k/disagg/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml
@@ -125,12 +125,18 @@ backend:
 
   tensor_parallel_size: 8
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: ['2151']
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
index 104f3b4ab..67da71d3d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml
@@ -92,12 +92,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 2
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '615'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "32"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
index 4c41ec82a..766d7fd79 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml
@@ -96,12 +96,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 1
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '1229'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "32"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
index c3dc14082..d2e17ac7a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml
@@ -88,12 +88,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '231'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
index 8f3663c94..a48f9c94a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml
@@ -101,12 +101,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '462'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
index bd77671ac..c07b82fad 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml
@@ -87,12 +87,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '60'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
index c1fccbc9d..d64e9777c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -85,12 +85,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '6'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
index 15c71e8d3..077357b39 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -85,12 +85,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '9'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
index 4f261058e..414388c6b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml
@@ -89,12 +89,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '117'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
index 07de7a34d..d49f37947 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -86,12 +86,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '30'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
index 4a55e5ed8..1624bcc3e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml
@@ -84,12 +84,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '924'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
index 2bedf4c23..f632508e1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml
@@ -86,12 +86,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '1845'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
index 1ff9ace49..6cd4b7697 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml
@@ -82,12 +82,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '231'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
index 215e8a6bf..10ab482b3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml
@@ -83,12 +83,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '462'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
index 4281abed2..850acc0da 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml
@@ -81,12 +81,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '60'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
index a0e0005e8..a1d5c9aac 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -79,12 +79,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '6'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
index 6eee90d2d..c3b1144bd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -79,12 +79,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '9'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
index 29e634316..2e972e14b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -80,12 +80,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '30'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
index bb02cdd0a..3dd8f5482 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/1k1k/disagg/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml
@@ -114,12 +114,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 1024
-  osl: 1024
-  concurrencies: '4916'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
index b78cb01af..007d7e4eb 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml
@@ -88,12 +88,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '77'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "32"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
index dd0ddda85..ecf82c12b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml
@@ -90,12 +90,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '78'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
index 2f0ef4e90..221dfc3f7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml
@@ -86,12 +86,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '6'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
index be3fc74ce..3b6a18fe6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml
@@ -86,12 +86,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '9'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
index 6a710bbb5..baf2c1e0d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml
@@ -87,12 +87,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '30'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
index 4d746af13..8be542e76 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml
@@ -89,12 +89,18 @@ backend:
   speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '154'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
index 2f630277e..0bf877f96 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml
@@ -89,12 +89,18 @@ backend:
 
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "154"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
index 9081201ba..b68e4f1a5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml
@@ -81,12 +81,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '6'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
index 938fd965c..06b713a32 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml
@@ -89,12 +89,18 @@ backend:
 
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "9"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
index c1eb86c19..030c98654 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml
@@ -89,12 +89,18 @@ backend:
 
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "30"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "64"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
index 40c84770f..1f882bc75 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h100-fp8/8k1k/disagg/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml
@@ -84,12 +84,18 @@ backend:
     backend: UCX
   stream_interval: 100
   num_postprocess_workers: 4
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: sa-bench
-  isl: 8192
-  osl: 1024
-  concurrencies: '308'
-  req_rate: inf
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "16"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "48"
 frontend:
   type: dynamo
   enable_multiple_frontends: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
index 7c3fc7c0e..230e3a281 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml
@@ -92,12 +92,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "896"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "64"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
index 4feb8690d..b66e9d91a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml
@@ -122,12 +122,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "144"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
index 522618223..246c12a61 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml
@@ -102,12 +102,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "13"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "96"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
index 5be701be2..84c66f292 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml
@@ -92,12 +92,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1024"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
index 6e8464280..898b6b248 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -92,12 +92,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "352"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "96"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
index 69f96bac7..ff64103a1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -120,12 +120,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "44"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "96"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
index a7275865f..04d320697 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml
@@ -138,12 +138,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 1
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1024"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
index b68aae478..af18c65d3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml
@@ -122,12 +122,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "512"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
index 506a8c580..f0e0f9a58 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml
@@ -92,12 +92,18 @@ backend:
     decoding_type: MTP
     num_nextn_predict_layers: 3
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "88"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "96"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
index 5d910619d..eaa74f374 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml
@@ -167,12 +167,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1152"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
index a11789b29..03de93867 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -132,12 +132,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "144"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
index 554f516e2..0f29aab2f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml
@@ -98,12 +98,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "11"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
index c48eded81..4393dacf8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml
@@ -86,12 +86,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1536"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
index 473753df3..9b2d8fbf5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -132,12 +132,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "288"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
index 80784e19d..ee3a951cf 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -132,12 +132,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "36"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "80"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
index 7c695e47f..6356363ac 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml
@@ -167,12 +167,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "3584"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "72"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
index 69d7b8708..ce67bee55 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml
@@ -132,12 +132,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "576" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "80" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml index 0c1828f27..a5522bdad 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/1k1k/disagg/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml @@ -132,12 +132,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "72" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "80" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml index 3bacea3c6..1ad52f9f3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "24" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml index eaa4536a4..23ad0751a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml index d84bf05a5..4649032a7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "9" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml index 19fa4c9f0..92ed944df 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml index 6eca7fe9d..01616d163 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "160" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml index 6cfd09aad..78cc69344 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "28" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml index ab5a8fa71..607011f5c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 1 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml index 219a6f1b8..02db00cb0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 2 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "64" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml index d8dd374c2..89cefb58e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml @@ -102,12 +102,18 @@ backend: decoding_type: MTP num_nextn_predict_layers: 3 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml index b92ecafe9..6f9e2c92e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml @@ -99,12 +99,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "128" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "16" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml index 65eddfb81..a7cc5137e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "48" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml index f42e7d15d..82064a374 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "9" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml index 5f96d875a..da13164cd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "768" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml index 5f2976b4d..38d63593a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "160" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "56" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml index 72974bb20..19ba51ba6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "28" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "64" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml index a7a96394c..3b35f1299 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. +container_mounts: + "$INFMAX_WORKSPACE": "/infmax-workspace" + benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512" - req_rate: "inf" + type: "custom" + command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh" + env: + PREFILL_GPUS: "8" + DECODE_GPUS: "8" + TOTAL_GPUS: "32" frontend: type: "dynamo" diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml index 2a27575f2..531f573f3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml @@ -96,12 +96,18 @@ backend: stream_interval: 100 num_postprocess_workers: 4 +# Bench client lives in this repo; mounted into the bench container at +# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract. 
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "192"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
index 602646d9c..c8a885d95 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsr1/trtllm/h200-fp8/8k1k/disagg/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml
@@ -96,12 +96,18 @@ backend:
   stream_interval: 100
   num_postprocess_workers: 4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "48"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "56"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
index bf5b441b9..27cc59a91 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -116,10 +116,16 @@ backend:
   no-disable-hybrid-kv-cache-manager: true
   enable-sleep-mode: true
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "128x256x1024x2048x4096"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "24"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
index 63e9e280c..66a2a5219 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -108,10 +108,16 @@ backend:
   no-disable-hybrid-kv-cache-manager: true
   enable-sleep-mode: true
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4096x8192"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "40"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
index 0c872e9c4..4eb66b9ba 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -148,10 +148,16 @@ backend:
   enable-sleep-mode: true
   tokenizer-mode: deepseek_v4
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x4x8x16x32x64"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "16"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
index d6b750bf2..3e6320fc8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -103,10 +103,16 @@ backend:
   no-disable-hybrid-kv-cache-manager: true
   enable-sleep-mode: true
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "512x1024"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "40"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
index 6213373b3..0f5611403 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/dsv4/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -113,10 +113,16 @@ backend:
   no-disable-hybrid-kv-cache-manager: true
   enable-sleep-mode: true
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4096x8192"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "8"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "72"
+    USE_CHAT_TEMPLATE: "false"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
index ce3eff436..49a38528d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml
@@ -106,12 +106,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "666"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
index 105b84bfd..c83b4c67b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml
@@ -110,12 +110,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "2253"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "32"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
index 9fb194ddc..e5a833580 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml
@@ -198,12 +198,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4301x6452"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "12"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
index 5639da411..a56150450 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml
@@ -119,12 +119,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x192x360x668"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
index f9496feb6..ffb109b8d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml
@@ -103,12 +103,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "5x15x30x55"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
index 71b016c4b..f75876142 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
@@ -134,12 +134,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4301"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
index 52b75bb4e..7fdf9daea 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/1k1k/disagg/stp/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml
@@ -118,12 +118,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4301"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "32"
+    TOTAL_GPUS: "40"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
index 8c1f0aa82..bbc7627ee 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml
@@ -107,12 +107,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "156"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "20"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
index d4c5086b0..5a0b04c91 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml
@@ -104,12 +104,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "36"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
index 8f6ea063f..90d294ff5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml
@@ -107,12 +107,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "5x15x30x60x105"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
index 4bfaa0e2c..8cc508d5e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml
@@ -105,12 +105,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "333"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "24"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
index d7d51627c..528b0b4f9 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml
@@ -107,12 +107,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "615"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
index e8df1179b..d0dbf80f0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml
@@ -136,12 +136,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2151"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
index db1778920..6eb391bba 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/trtllm/gb200-fp4/8k1k/disagg/stp/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml
@@ -119,12 +119,18 @@ backend:
   - cutedsl
   - cuda_core
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2253"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "44"
 
 frontend:
   type: "dynamo"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
index ecdc9233a..c5230d9e5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p1d-dep4-dep16.yaml
@@ -93,9 +93,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 512
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "256x512x1024x2048x3072x4096"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "20"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
index 43167b5f3..0992a5091 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/1k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
@@ -90,9 +90,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 1024
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64x128"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "20"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
index 1ab6ca279..5670a9d54 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-1p4d-dep4-tep4.yaml
@@ -90,9 +90,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 16
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4x8x16x32x128"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "4"
+    TOTAL_GPUS: "20"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml
index ca4e9813f..cecacdfd7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-3p1d-dep4-dep16.yaml
@@ -93,9 +93,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 256
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "512x1024"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "28"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml
index cd9f94a9d..259db9436 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-5p1d-dep4-dep8.yaml
@@ -93,9 +93,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 512
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "2048"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "8"
+    TOTAL_GPUS: "28"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml
index 47d3d7ee5..0a26d118d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/kimik2.5/vllm/gb200-fp4/8k1k/disagg/stp/disagg-gb200-6p1d-dep4-dep16.yaml
@@ -93,9 +93,15 @@ backend:
   stream-interval: 50
   max-cudagraph-capture-size: 512
 
+# Bench client lives in this repo; mounted into the bench container at
+# /infmax-workspace. See benchmarks/multi_node/srt_bench.sh for the env contract.
+container_mounts:
+  "$INFMAX_WORKSPACE": "/infmax-workspace"
+
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "3072x4096"
-  req_rate: "inf"
+  type: "custom"
+  command: "bash /infmax-workspace/benchmarks/multi_node/srt_bench.sh"
+  env:
+    PREFILL_GPUS: "4"
+    DECODE_GPUS: "16"
+    TOTAL_GPUS: "40"